Mirror of https://github.com/paboyle/Grid.git (synced 2025-11-04 05:54:32 +00:00)
	Merge branch 'develop' into feature/multi-communicator
Changed files:

TODO | 17 changed lines
							@@ -2,19 +2,16 @@ TODO:
 ---------------
 
 Large item work list:
 
-1)- I/O;    There appear to be issues with MPI IO and NERSC with large files.
-            Possible 2GB limit reappeared. GPFS driver in Intel MPI.
-
-2)- BG/Q port and check
+1)- BG/Q port and check
+2)- Christoph's local basis expansion Lanczos
+3)- Precision conversion and sort out localConvert      <-- partial
 
-3)- Christoph's local basis expansion Lanczos; port to use Lattice_transfer features
-
-4)- Precision conversion and sort out localConvert      <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-5)- Physical propagator interface
-6)- Conserved currents
-7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-8)- HDCR resume
+4)- Physical propagator interface
+5)- Conserved currents
+6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- HDCR resume
 
 Recent DONE

benchmarks/Benchmark_ITT.cc | 518 added lines (new file)
@@ -0,0 +1,518 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./benchmarks/Benchmark_ITT.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

// Mean, standard error of the mean, and extrema of a sample of timings.
struct time_statistics{
  double mean;
  double err;
  double min;
  double max;

  void statistics(std::vector<double> v){
      double sum = std::accumulate(v.begin(), v.end(), 0.0);
      mean = sum / v.size();

      std::vector<double> diff(v.size());
      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));

      auto result = std::minmax_element(v.begin(), v.end());
      min = *result.first;
      max = *result.second;
  }
};

void comms_header(){
  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT
};

struct controls {
  int Opt;
  int CommsOverlap;
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
  //  int HugePages;
};

class Benchmark {
public:
  static void Decomposition (void ) {

    int threads = GridThread::GetThreads();
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvReal          : "<<sizeof(vReal )*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplex       : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  static void Comms(void)
  {
    int Nloop=100;
    int nmu=0;
    int maxlat=32;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++; // count the partitioned directions

    std::vector<double> t_time(Nloop);
    time_statistics timestat;

    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    comms_header();

    for(int lat=4;lat<=maxlat;lat+=4){
      for(int Ls=8;Ls<=8;Ls*=2){

        std::vector<int> latt_size  ({lat*mpi_layout[0],
                                      lat*mpi_layout[1],
                                      lat*mpi_layout[2],
                                      lat*mpi_layout[3]});

        GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
        RealD Nrank = Grid._Nprocessors;
        RealD Nnode = Grid.NodeCount();
        RealD ppn = Nrank/Nnode;

        std::vector<HalfSpinColourVectorD *> xbuf(8);
        std::vector<HalfSpinColourVectorD *> rbuf(8);
        Grid.ShmBufferFreeAll();
        for(int d=0;d<8;d++){
          xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
        }

        int ncomm;
        int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
        double dbytes;
        for(int i=0;i<Nloop;i++){
          double start=usecond();

          std::vector<CartesianCommunicator::CommsRequest_t> requests;
          dbytes=0;
          ncomm=0;

          parallel_for(int dir=0;dir<8;dir++){

            double tbytes;
            int mu =dir % 4;

            if (mpi_layout[mu]>1 ) {

              ncomm++;
              int xmit_to_rank;
              int recv_from_rank;
              if ( dir == mu ) {
                int comm_proc=1;
                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
              } else {
                int comm_proc = mpi_layout[mu]-1;
                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
              }
#if 1
              tbytes= Grid.StencilSendToRecvFromBegin(requests,
                                                      (void *)&xbuf[dir][0],
                                                      xmit_to_rank,
                                                      (void *)&rbuf[dir][0],
                                                      recv_from_rank,
                                                      bytes,dir);
              Grid.StencilSendToRecvFromComplete(requests,dir);
#endif
              requests.resize(0);

#pragma omp atomic
              dbytes+=tbytes;
            }
          }
          Grid.Barrier();
          double stop=usecond();
          t_time[i] = stop-start; // microseconds
        }

        timestat.statistics(t_time);

        dbytes=dbytes*ppn;
        double xbytes    = dbytes*0.5;
        double rbytes    = dbytes*0.5;
        double bidibytes = dbytes;

        std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
                 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
                 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
                 <<xbytes/timestat.max <<" "<< xbytes/timestat.min
                 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
                 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;

      }
    }

    return;
  }

  static void Memory(void)
  {
    const int Nvec=8;
    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
    typedef iVector<vReal,Nvec> Vec;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

    uint64_t lmax=48;
#define NLOOP (10*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      Vec rn ; random(sRNG,rn);

      LatticeVec z(&Grid); z=rn;
      LatticeVec x(&Grid); x=rn;
      LatticeVec y(&Grid); y=rn;
      double a=2.0;

      uint64_t Nloop=NLOOP;

      double start=usecond();
      for(uint64_t i=0;i<Nloop;i++){
        z=a*x-y;
        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double flops=vol*Nvec*2;// mul,add
      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3)
               << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }
  };

  static void DWF(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
                                                                       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src   (FGrid); random(RNG5,src);
    LatticeFermion ref   (FGrid);
    LatticeFermion tmp   (FGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;

    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);

    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

    ////////////////////////////////////
    // Naive wilson implementation
    ////////////////////////////////////
    {
      LatticeGaugeField Umu5d(FGrid);
      std::vector<LatticeColourMatrix> U(4,FGrid);
      for(int ss=0;ss<Umu._grid->oSites();ss++){
        for(int s=0;s<Ls;s++){
          Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
        }
      }
      ref = zero;
      for(int mu=0;mu<Nd;mu++){
        U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
      }
      for(int mu=0;mu<Nd;mu++){

        tmp = U[mu]*Cshift(src,mu+1,1);
        ref=ref + tmp - Gamma(Gmu[mu])*tmp;

        tmp =adj(U[mu])*src;
        tmp =Cshift(tmp,mu+1,-1);
        ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      }
      ref = -0.5*ref;
    }

    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    LatticeFermion r_eo  (FGrid);
    LatticeFermion err   (FGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512)
      const int num_cases = 6;
#else
      const int num_cases = 4;
#endif
      controls Cases [] = {
#if defined(AVX512)
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
#endif
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      };

      for(int c=0;c<num_cases;c++) {

        // Select the kernel/comms policy for this case before printing the banner,
        // so the banner reflects the case actually being run.
        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
        if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
        if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

        int nwarm = 10;
        double t0=usecond();
        FGrid->Barrier();
        for(int i=0;i<nwarm;i++){
          Dw.DhopEO(src_o,r_e,DaggerNo);
        }
        FGrid->Barrier();
        double t1=usecond();
        uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
        FGrid->Broadcast(0,&ncall,sizeof(ncall));

        //	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
        Dw.ZeroCounters();

        time_statistics timestat;
        std::vector<double> t_time(ncall);
        for(uint64_t i=0;i<ncall;i++){
          t0=usecond();
          Dw.DhopEO(src_o,r_e,DaggerNo);
          t1=usecond();
          t_time[i] = t1-t0;
        }
        FGrid->Barrier();

        double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
        double flops=(1344.0*volume)/2;
        double mf_hi, mf_lo, mf_err;

        timestat.statistics(t_time);
        mf_hi = flops/timestat.min;
        mf_lo = flops/timestat.max;
        mf_err= flops/timestat.min * timestat.err/timestat.mean;

        mflops = flops/timestat.mean;

        if ( mflops_best == 0   ) mflops_best = mflops;
        if ( mflops_worst== 0   ) mflops_worst= mflops;
        if ( mflops>mflops_best ) mflops_best = mflops;
        if ( mflops<mflops_worst) mflops_worst= mflops;

        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;

        Dw.Report();

        Dw.DhopEO(src_o,r_e,DaggerNo);
        Dw.DhopOE(src_e,r_o,DaggerNo);
        setCheckerboard(r_eo,r_o);
        setCheckerboard(r_eo,r_e);
        err = r_eo-ref;
        std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
        assert((norm2(err)<1.0e-4));

      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
  }

};

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});

  Benchmark::Decomposition();

  int do_memory=1;
  int do_comms =1;
  int do_su3   =0;
  int do_wilson=1;
  int do_dwf   =1;

  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Memory();
  }

  if ( do_comms ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Comms();
  }

  if ( do_su3 ) {
    // empty for now
  }

  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::DWF(Ls,16);
    Benchmark::DWF(Ls,24);
    Benchmark::DWF(Ls,32);
  }

  if ( do_dwf ) {
    int Ls=16;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::DWF(Ls,8);
    Benchmark::DWF(Ls,12);
    Benchmark::DWF(Ls,16);
    Benchmark::DWF(Ls,24);
  }

  Grid_finalize();
}
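Aside (editorial, not part of the commit): the err field in time_statistics above is the standard error of the mean, sqrt(sum_i (x_i - mean)^2 / (N*(N-1))), which is what the MB/s and mflop/s "(err)" columns propagate. A minimal standalone C++ sketch of the same reduction, with hypothetical timing values:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <numeric>
#include <vector>

// Same reduction as time_statistics::statistics, with no Grid dependencies.
int main() {
  std::vector<double> v = {10.0, 12.0, 11.0, 13.0}; // hypothetical timings (usec)
  double mean = std::accumulate(v.begin(), v.end(), 0.0) / v.size();

  std::vector<double> diff(v.size());
  std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
  double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
  double err = std::sqrt(sq_sum / (v.size() * (v.size() - 1))); // std error of the mean

  auto mm = std::minmax_element(v.begin(), v.end());
  std::cout << mean << " +/- " << err
            << " (min " << *mm.first << ", max " << *mm.second << ")\n";
  return 0;
}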
@@ -11,7 +11,7 @@ int PointerCache::victim;
 
 void *PointerCache::Insert(void *ptr,size_t bytes) {
 
-  if (bytes < 4096 ) return NULL;
+  if (bytes < 4096 ) return ptr;
 
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
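For context, a hypothetical caller-side sketch (the surrounding allocator code is not shown in this diff, so this is an assumption about its contract): Insert() hands back the pointer the caller must still free(), while NULL means the block was absorbed into the cache. Under the old "return NULL" for blocks below 4096 bytes, small blocks were neither cached nor freed; returning ptr lets the caller release them.

#include <cstdlib>

// Hypothetical stand-in for PointerCache::Insert (assumption, not Grid's code):
// returns the pointer the caller must free(), or nullptr if it was cached.
static void *Insert(void *ptr, std::size_t bytes) {
  if (bytes < 4096) return ptr; // post-fix behaviour: small blocks are not cached
  return ptr;                   // cache full in this stub, so the caller frees
}

static void deallocate(void *ptr, std::size_t bytes) {
  ptr = Insert(ptr, bytes);
  if (ptr) std::free(ptr); // with the old 'return NULL', small blocks leaked here
}

int main() {
  deallocate(std::malloc(128), 128);
  return 0;
}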
@@ -185,17 +185,18 @@ public:
     ////////////////////////////////////////////////////////////////
 
     void show_decomposition(){
-      std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
-      std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
-      std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
-      std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
-      std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl;
-      std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl;
-      std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
-      std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
-      std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;
-      std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
-      std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;
+      std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
+      std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
+      std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
+      std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
+      std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
+      std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
+      std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
+      std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
+      std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
+      std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;
+      std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
+      std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;
     }
 
     ////////////////////////////////////////////////////////////////
@@ -62,77 +62,81 @@ public:
       return shift;
     }
     GridCartesian(const std::vector<int> &dimensions,
-		  const std::vector<int> &simd_layout,
-		  const std::vector<int> &processor_grid
-		  ) : GridBase(processor_grid)
+                  const std::vector<int> &simd_layout,
+                  const std::vector<int> &processor_grid) : GridBase(processor_grid)
     {
-        ///////////////////////
-        // Grid information
-        ///////////////////////
-        _ndimension = dimensions.size();
-
-        _fdimensions.resize(_ndimension);
-        _gdimensions.resize(_ndimension);
-        _ldimensions.resize(_ndimension);
-        _rdimensions.resize(_ndimension);
-        _simd_layout.resize(_ndimension);
-	_lstart.resize(_ndimension);
-	_lend.resize(_ndimension);
-
-        _ostride.resize(_ndimension);
-        _istride.resize(_ndimension);
-
-        _fsites = _gsites = _osites = _isites = 1;
+      ///////////////////////
+      // Grid information
+      ///////////////////////
+      _ndimension = dimensions.size();
 
-        for(int d=0;d<_ndimension;d++){
-	  _fdimensions[d] = dimensions[d]; // Global dimensions
-	  _gdimensions[d] = _fdimensions[d]; // Global dimensions
-	  _simd_layout[d] = simd_layout[d];
-	  _fsites = _fsites * _fdimensions[d];
-	  _gsites = _gsites * _gdimensions[d];
+      _fdimensions.resize(_ndimension);
+      _gdimensions.resize(_ndimension);
+      _ldimensions.resize(_ndimension);
+      _rdimensions.resize(_ndimension);
+      _simd_layout.resize(_ndimension);
+      _lstart.resize(_ndimension);
+      _lend.resize(_ndimension);
 
-	  //FIXME check for exact division
+      _ostride.resize(_ndimension);
+      _istride.resize(_ndimension);
 
-	  // Use a reduced simd grid
-	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
-	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
-	  _lstart[d]     = _processor_coor[d]*_ldimensions[d];
-	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
-	  _osites  *= _rdimensions[d];
-	  _isites  *= _simd_layout[d];
-
-	  // Addressing support
-	  if ( d==0 ) {
-	    _ostride[d] = 1;
-	    _istride[d] = 1;
-	  } else {
-	    _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
-	    _istride[d] = _istride[d-1]*_simd_layout[d-1];
-	  }
+      _fsites = _gsites = _osites = _isites = 1;
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        _fdimensions[d] = dimensions[d];   // Global dimensions
+        _gdimensions[d] = _fdimensions[d]; // Global dimensions
+        _simd_layout[d] = simd_layout[d];
+        _fsites = _fsites * _fdimensions[d];
+        _gsites = _gsites * _gdimensions[d];
+
+        // Use a reduced simd grid
+        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
+        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
+        _osites *= _rdimensions[d];
+        _isites *= _simd_layout[d];
+
+        // Addressing support
+        if (d == 0)
+        {
+          _ostride[d] = 1;
+          _istride[d] = 1;
+        }
-
-        ///////////////////////
-        // subplane information
-        ///////////////////////
-        _slice_block.resize(_ndimension);
-        _slice_stride.resize(_ndimension);
-        _slice_nblock.resize(_ndimension);
-
-        int block =1;
-        int nblock=1;
-        for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
-
-        for(int d=0;d<_ndimension;d++){
-            nblock/=_rdimensions[d];
-            _slice_block[d] =block;
-            _slice_stride[d]=_ostride[d]*_rdimensions[d];
-            _slice_nblock[d]=nblock;
-            block = block*_rdimensions[d];
+        else
+        {
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
+        }
+      }
+
+      ///////////////////////
+      // subplane information
+      ///////////////////////
+      _slice_block.resize(_ndimension);
+      _slice_stride.resize(_ndimension);
+      _slice_nblock.resize(_ndimension);
+
+      int block = 1;
+      int nblock = 1;
+      for (int d = 0; d < _ndimension; d++)
+        nblock *= _rdimensions[d];
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        nblock /= _rdimensions[d];
+        _slice_block[d] = block;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
+        _slice_nblock[d] = nblock;
+        block = block * _rdimensions[d];
+      }
     };
 };
 
 
 }
 #endif
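The two new asserts turn the old "FIXME check for exact division" into a hard check: each global extent must divide exactly by the processor grid, and each local extent by the SIMD layout. A standalone sketch of the same arithmetic with hypothetical sizes:

#include <cassert>
#include <cstdio>

int main() {
  // Hypothetical decomposition of one dimension: global extent 16,
  // 2 MPI ranks along it, 2 SIMD lanes along it.
  int gdim = 16, procs = 2, simd = 2;

  int ldim = gdim / procs;      // local extent per rank
  assert(ldim * procs == gdim); // exact division, as the new code now enforces

  int rdim = ldim / simd;       // reduced ("over-decomposed") extent
  assert(rdim * simd == ldim);

  std::printf("global %d -> local %d -> reduced %d\n", gdim, ldim, rdim);
  return 0;
}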
@@ -131,21 +131,21 @@ public:
       Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
     }
     void Init(const std::vector<int> &dimensions,
-	      const std::vector<int> &simd_layout,
-	      const std::vector<int> &processor_grid,
-	      const std::vector<int> &checker_dim_mask,
-	      int checker_dim)
+              const std::vector<int> &simd_layout,
+              const std::vector<int> &processor_grid,
+              const std::vector<int> &checker_dim_mask,
+              int checker_dim)
     {
-    ///////////////////////
-    // Grid information
-    ///////////////////////
+      ///////////////////////
+      // Grid information
+      ///////////////////////
       _checker_dim = checker_dim;
-      assert(checker_dim_mask[checker_dim]==1);
+      assert(checker_dim_mask[checker_dim] == 1);
       _ndimension = dimensions.size();
-      assert(checker_dim_mask.size()==_ndimension);
-      assert(processor_grid.size()==_ndimension);
-      assert(simd_layout.size()==_ndimension);
-
+      assert(checker_dim_mask.size() == _ndimension);
+      assert(processor_grid.size() == _ndimension);
+      assert(simd_layout.size() == _ndimension);
+
       _fdimensions.resize(_ndimension);
       _gdimensions.resize(_ndimension);
       _ldimensions.resize(_ndimension);
@@ -153,114 +153,133 @@ public:
       _simd_layout.resize(_ndimension);
       _lstart.resize(_ndimension);
       _lend.resize(_ndimension);
 
       _ostride.resize(_ndimension);
       _istride.resize(_ndimension);
 
       _fsites = _gsites = _osites = _isites = 1;
 
-      _checker_dim_mask=checker_dim_mask;
+      _checker_dim_mask = checker_dim_mask;
 
-      for(int d=0;d<_ndimension;d++){
-	_fdimensions[d] = dimensions[d];
-	_gdimensions[d] = _fdimensions[d];
-	_fsites = _fsites * _fdimensions[d];
-	_gsites = _gsites * _gdimensions[d];
-
-	if (d==_checker_dim) {
-	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
-	}
-	_ldimensions[d] = _gdimensions[d]/_processors[d];
-	_lstart[d]     = _processor_coor[d]*_ldimensions[d];
-	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
-
-	// Use a reduced simd grid
-	_simd_layout[d] = simd_layout[d];
-	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
-	assert(_rdimensions[d]>0);
-
-	// all elements of a simd vector must have same checkerboard.
-	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
-	if ( _simd_layout[d]>1 ) {
-	  if ( checker_dim_mask[d] ) {
-	    assert( (_rdimensions[d]&0x1) == 0 );
-	  }
-	}
-
-	_osites *= _rdimensions[d];
-	_isites *= _simd_layout[d];
-
-	// Addressing support
-	if ( d==0 ) {
-	  _ostride[d] = 1;
-	  _istride[d] = 1;
-	} else {
-	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
-	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
-	}
-      }
+      for (int d = 0; d < _ndimension; d++)
+      {
+        _fdimensions[d] = dimensions[d];
+        _gdimensions[d] = _fdimensions[d];
+        _fsites = _fsites * _fdimensions[d];
+        _gsites = _gsites * _gdimensions[d];
+
+        if (d == _checker_dim)
+        {
+          assert((_gdimensions[d] & 0x1) == 0);
+          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
+        }
+        _ldimensions[d] = _gdimensions[d] / _processors[d];
+        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
+
+        // Use a reduced simd grid
+        _simd_layout[d] = simd_layout[d];
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d];
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+        assert(_rdimensions[d] > 0);
+
+        // all elements of a simd vector must have same checkerboard.
+        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
+        if (_simd_layout[d] > 1)
+        {
+          if (checker_dim_mask[d])
+          {
+            assert((_rdimensions[d] & 0x1) == 0);
+          }
+        }
+
+        _osites *= _rdimensions[d];
+        _isites *= _simd_layout[d];
+
+        // Addressing support
+        if (d == 0)
+        {
+          _ostride[d] = 1;
+          _istride[d] = 1;
+        }
+        else
+        {
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
+        }
+      }
 
       ////////////////////////////////////////////////////////////////////////////////////////////
       // subplane information
       ////////////////////////////////////////////////////////////////////////////////////////////
       _slice_block.resize(_ndimension);
       _slice_stride.resize(_ndimension);
       _slice_nblock.resize(_ndimension);
 
-      int block =1;
-      int nblock=1;
-      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
-
-      for(int d=0;d<_ndimension;d++){
-	nblock/=_rdimensions[d];
-	_slice_block[d] =block;
-	_slice_stride[d]=_ostride[d]*_rdimensions[d];
-	_slice_nblock[d]=nblock;
-	block = block*_rdimensions[d];
-      }
+      int block = 1;
+      int nblock = 1;
+      for (int d = 0; d < _ndimension; d++)
+        nblock *= _rdimensions[d];
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        nblock /= _rdimensions[d];
+        _slice_block[d] = block;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
+        _slice_nblock[d] = nblock;
+        block = block * _rdimensions[d];
+      }
 
       ////////////////////////////////////////////////
       // Create a checkerboard lookup table
       ////////////////////////////////////////////////
       int rvol = 1;
-      for(int d=0;d<_ndimension;d++){
-	rvol=rvol * _rdimensions[d];
+      for (int d = 0; d < _ndimension; d++)
+      {
+        rvol = rvol * _rdimensions[d];
       }
       _checker_board.resize(rvol);
-      for(int osite=0;osite<_osites;osite++){
-	_checker_board[osite] = CheckerBoardFromOindex (osite);
+      for (int osite = 0; osite < _osites; osite++)
+      {
+        _checker_board[osite] = CheckerBoardFromOindex(osite);
       }
 
     };
 
-protected:
+  protected:
     virtual int oIndex(std::vector<int> &coor)
     {
-      int idx=0;
-      for(int d=0;d<_ndimension;d++) {
-	if( d==_checker_dim ) {
-	  idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
-	} else {
-	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
-	}
+      int idx = 0;
+      for (int d = 0; d < _ndimension; d++)
+      {
+        if (d == _checker_dim)
+        {
+          idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
+        }
+        else
+        {
+          idx += _ostride[d] * (coor[d] % _rdimensions[d]);
+        }
       }
       return idx;
     };
 
     virtual int iIndex(std::vector<int> &lcoor)
     {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) {
-	  if( d==_checker_dim ) {
-	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
-	  } else {
-	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
-	  }
-	}
-        return idx;
+      int idx = 0;
+      for (int d = 0; d < _ndimension; d++)
+      {
+        if (d == _checker_dim)
+        {
+          idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
+        }
+        else
+        {
+          idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
+        }
+      }
+      return idx;
     }
 };
 
 }
 #endif
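The checkerboarded grid stores only half the sites along _checker_dim, and oIndex compensates by halving that coordinate, so the two checkerboard colours of a site pair map to the same stored index. A standalone sketch of that indexing rule, with hypothetical extents and strides:

#include <cstdio>

int main() {
  // Hypothetical 8x8 lattice, checkerboarded along d=0: stored extent is 4x8.
  int rdim[2] = {4, 8};    // reduced dimensions after halving d=0
  int ostride[2] = {1, 4}; // outer strides built from rdim
  int checker_dim = 0;

  // oIndex rule from the diff: halve the coordinate along the checker dimension.
  auto oIndex = [&](int x, int y) {
    int coor[2] = {x, y}, idx = 0;
    for (int d = 0; d < 2; d++) {
      int c = (d == checker_dim) ? (coor[d] / 2) : coor[d];
      idx += ostride[d] * (c % rdim[d]);
    }
    return idx;
  };

  // (3,5) and (2,5) differ only in checkerboard colour -> same stored index.
  std::printf("%d %d\n", oIndex(3, 5), oIndex(2, 5));
  return 0;
}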
@@ -98,35 +98,39 @@ class BinaryIO {
 
     NerscChecksum(grid,scalardata,nersc_csum);
   }
 
-  template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
+  template <class fobj>
+  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
   {
-    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
+    const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
 
-    uint64_t lsites              =grid->lSites();
-    if (fbuf.size()==1) {
-      lsites=1;
+    uint64_t lsites = grid->lSites();
+    if (fbuf.size() == 1)
+    {
+      lsites = 1;
     }
 
-#pragma omp parallel
-    { 
-      uint32_t nersc_csum_thr=0;
+    #pragma omp parallel
+    {
+      uint32_t nersc_csum_thr = 0;
 
-#pragma omp for
-      for(uint64_t local_site=0;local_site<lsites;local_site++){
-	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
-	for(uint64_t j=0;j<size32;j++){
-	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
-	}
+      #pragma omp for
+      for (uint64_t local_site = 0; local_site < lsites; local_site++)
+      {
+        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
+        for (uint64_t j = 0; j < size32; j++)
+        {
+          nersc_csum_thr = nersc_csum_thr + site_buf[j];
+        }
       }
 
-#pragma omp critical
+      #pragma omp critical
       {
-	nersc_csum  += nersc_csum_thr;
+        nersc_csum += nersc_csum_thr;
       }
     }
   }
 
   template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
   {
     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
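The reformatted checksum keeps the hand-rolled pattern of a per-thread partial sum merged under #pragma omp critical; an OpenMP reduction clause expresses the same computation more compactly. A standalone sketch (not Grid code):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint32_t> words(1 << 20, 7u); // hypothetical buffer of 32-bit words
  uint32_t csum = 0;

  // Equivalent to the per-thread partial sum + '#pragma omp critical' merge:
  // OpenMP keeps a private csum per thread and folds them together at the end.
#pragma omp parallel for reduction(+ : csum)
  for (long i = 0; i < (long)words.size(); i++) {
    csum += words[i]; // unsigned wrap-around, as in the NERSC checksum
  }

  std::printf("checksum %08x\n", csum);
  return 0;
}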
@@ -266,7 +270,7 @@ class BinaryIO {
     grid->Barrier();
     GridStopWatch timer;
     GridStopWatch bstimer;
-    
+
     nersc_csum=0;
     scidac_csuma=0;
     scidac_csumb=0;
@@ -362,18 +366,22 @@ class BinaryIO {
 #else 
 	assert(0);
 #endif
-      } else { 
-	std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
-		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
-	std::ifstream fin;
-	fin.open(file,std::ios::binary|std::ios::in);
-	if ( control & BINARYIO_MASTER_APPEND )  {
-	  fin.seekg(-sizeof(fobj),fin.end);
-	} else { 
-	  fin.seekg(offset+myrank*lsites*sizeof(fobj));
-	}
-	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
-	fin.close();
+      } else {
+        std::cout << GridLogMessage << "C++ read I/O " << file << " : "
+                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+        std::ifstream fin;
+        fin.open(file, std::ios::binary | std::ios::in);
+        if (control & BINARYIO_MASTER_APPEND)
+        {
+          fin.seekg(-sizeof(fobj), fin.end);
+        }
+        else
+        {
+          fin.seekg(offset + myrank * lsites * sizeof(fobj));
+        }
+        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
+        assert(fin.fail() == 0);
+        fin.close();
       }
       timer.Stop();
@@ -405,30 +413,78 @@ class BinaryIO {
 | 
			
		||||
      timer.Start();
 | 
			
		||||
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 | 
			
		||||
#ifdef USE_MPI_IO
 | 
			
		||||
	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
 | 
			
		||||
	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
 | 
			
		||||
	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
 | 
			
		||||
	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
 | 
			
		||||
	MPI_File_close(&fh);
 | 
			
		||||
	MPI_Type_free(&fileArray);
 | 
			
		||||
	MPI_Type_free(&localArray);
 | 
			
		||||
        std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
 | 
			
		||||
        std::cout << GridLogMessage << "Checking for errors" << std::endl;
 | 
			
		||||
        if (ierr != MPI_SUCCESS)
 | 
			
		||||
        {
 | 
			
		||||
          char error_string[BUFSIZ];
 | 
			
		||||
          int length_of_error_string, error_class;
 | 
			
		||||
 | 
			
		||||
          MPI_Error_class(ierr, &error_class);
 | 
			
		||||
          MPI_Error_string(error_class, error_string, &length_of_error_string);
 | 
			
		||||
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
 | 
			
		||||
          MPI_Error_string(ierr, error_string, &length_of_error_string);
 | 
			
		||||
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
 | 
			
		||||
          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
 | 
			
		||||
        assert(ierr == 0);
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
 | 
			
		||||
        assert(ierr == 0);
 | 
			
		||||
 | 
			
		||||
        MPI_File_close(&fh);
 | 
			
		||||
        MPI_Type_free(&fileArray);
 | 
			
		||||
        MPI_Type_free(&localArray);
 | 
			
		||||
#else 
 | 
			
		||||
	assert(0);
 | 
			
		||||
#endif
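The error decode added above is the standard MPI idiom: translate the return code first to its error class (the coarse category), then to the implementation-specific message. A self-contained sketch of the same helper (assumed names, plain MPI):

    #include <mpi.h>
    #include <cstdio>

    void report_mpi_error(int ierr, int myrank) {
      if (ierr == MPI_SUCCESS) return;
      char error_string[MPI_MAX_ERROR_STRING];
      int len, error_class;
      MPI_Error_class(ierr, &error_class);               // coarse category
      MPI_Error_string(error_class, error_string, &len);
      fprintf(stderr, "%3d: %s\n", myrank, error_string);
      MPI_Error_string(ierr, error_string, &len);        // implementation-specific detail
      fprintf(stderr, "%3d: %s\n", myrank, error_string);
      MPI_Abort(MPI_COMM_WORLD, 1);                      // no point continuing a collective write
    }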
      } else {
	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
	std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
	if ( control & BINARYIO_MASTER_APPEND )  {
        std::ofstream fout;
        fout.exceptions(std::fstream::failbit | std::fstream::badbit);
        try {
          fout.open(file, std::ios::binary | std::ios::out | std::ios::in);
        } catch (const std::fstream::failure &exc) {
          std::cout << GridLogError << "Error in opening the file " << file << " for output" << std::endl;
          std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
          std::cout << GridLogError << "Probable cause: wrong path, inaccessible location" << std::endl;
#ifdef USE_MPI_IO
          MPI_Abort(MPI_COMM_WORLD, 1);
#else
          exit(1);
#endif
        }
        std::cout << GridLogMessage << "C++ write I/O " << file << " : "
                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;

        if (control & BINARYIO_MASTER_APPEND) {
	  fout.seekp(0,fout.end);
	} else {
	  fout.seekp(offset+myrank*lsites*sizeof(fobj));
	}
	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
        try {
          fout.write((char *)&iodata[0], iodata.size() * sizeof(fobj)); //assert( fout.fail()==0);
        } catch (const std::fstream::failure &exc) {
          std::cout << GridLogError << "Exception in writing file " << file << std::endl;
          std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
#ifdef USE_MPI_IO
          MPI_Abort(MPI_COMM_WORLD, 1);
#else
          exit(1);
#endif
        }

        fout.close();
      }
      timer.Stop();
    }
  }
  timer.Stop();
  }
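Enabling failbit/badbit exceptions on the stream before open(), as done above, turns silent stream failures into catchable errors, so a bad path or a full filesystem is reported instead of silently producing a truncated file. A minimal sketch of the pattern outside Grid (hypothetical helper):

    #include <fstream>
    #include <iostream>
    #include <string>

    bool write_blob(const std::string &file, const char *data, std::streamsize bytes) {
      std::ofstream fout;
      fout.exceptions(std::fstream::failbit | std::fstream::badbit);  // throw on failure
      try {
        fout.open(file, std::ios::binary | std::ios::out);
        fout.write(data, bytes);
        fout.close();
      } catch (const std::fstream::failure &exc) {
        std::cerr << "I/O failure on " << file << ": " << exc.what() << std::endl;
        return false;
      }
      return true;
    }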

    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
@@ -442,11 +498,14 @@ class BinaryIO {
    //////////////////////////////////////////////////////////////////////////////
    // Safety check
    //////////////////////////////////////////////////////////////////////////////
    grid->Barrier();
    grid->GlobalSum(nersc_csum);
    grid->GlobalXOR(scidac_csuma);
    grid->GlobalXOR(scidac_csumb);
    grid->Barrier();
    // if the data size is 1 we do not want to sum over the MPI ranks
    if (iodata.size() != 1){
      grid->Barrier();
      grid->GlobalSum(nersc_csum);
      grid->GlobalXOR(scidac_csuma);
      grid->GlobalXOR(scidac_csumb);
      grid->Barrier();
    }
  }

  /////////////////////////////////////////////////////////////////////////////
@@ -546,9 +605,9 @@ class BinaryIO {
    int gsites = grid->gSites();
    int lsites = grid->lSites();

    uint32_t nersc_csum_tmp;
    uint32_t scidac_csuma_tmp;
    uint32_t scidac_csumb_tmp;
    uint32_t nersc_csum_tmp   = 0;
    uint32_t scidac_csuma_tmp = 0;
    uint32_t scidac_csumb_tmp = 0;

    GridStopWatch timer;

@@ -165,7 +165,7 @@ class HMCResourceManager {
  // Grids
  //////////////////////////////////////////////////////////////

  void AddGrid(std::string s, GridModule& M) {
  void AddGrid(const std::string s, GridModule& M) {
    // Check for name clashes
    auto search = Grids.find(s);
    if (search != Grids.end()) {
@@ -174,14 +174,24 @@ class HMCResourceManager {
      exit(1);
    }
    Grids[s] = std::move(M);
    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
    std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
    std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
    Grids[s].show_full_decomposition();
    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
  }

  // Add a named grid set, 4d shortcut
  void AddFourDimGrid(std::string s) {
  void AddFourDimGrid(const std::string s) {
    GridFourDimModule<vComplex> Mod;
    AddGrid(s, Mod);
  }

  // Add a named grid set, 4d shortcut + tweak simd lanes
  void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
    GridFourDimModule<vComplex> Mod(simd_decomposition);
    AddGrid(s, Mod);
  }
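Sketched usage of the two overloads (grid names hypothetical; the product of the four SIMD entries must equal vComplex::Nsimd() on the target architecture):

    // inside an HMC driver, after constructing the runner (here called TheHMC)
    TheHMC.Resources.AddFourDimGrid("gauge");           // default SIMD decomposition
    TheHMC.Resources.AddFourDimGrid("gauge_tweaked",
                                    std::vector<int>({2, 2, 1, 1}));  // explicit lanes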

  GridCartesian* GetCartesian(std::string s = "") {

@@ -33,28 +33,29 @@ directory
namespace Grid {

// Resources
// Modules for grids 
// Modules for grids

// Introduce another namespace HMCModules?

class GridModuleParameters: Serializable{   
class GridModuleParameters: Serializable{
public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
  std::string, lattice,
  std::string, mpi);

  std::vector<int> getLattice(){return strToVec<int>(lattice);}
  std::vector<int> getMpi()    {return strToVec<int>(mpi);}
  std::vector<int> getLattice() const {return strToVec<int>(lattice);}
  std::vector<int> getMpi()     const {return strToVec<int>(mpi);}

  void check(){
    if (getLattice().size() != getMpi().size()) {
      std::cout << GridLogError 

  void check() const {
    if (getLattice().size() != getMpi().size() ) {
      std::cout << GridLogError
                << "Error in GridModuleParameters: lattice and mpi dimensions "
                   "do not match"
                << std::endl;
      exit(1);
    }
  }    
  }
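check() enforces that the two serialised strings decode to integer vectors of equal rank before any grid is built. Sketched usage, assuming an XML parameter file and Grid's XmlReader (file name and node layout hypothetical):

    // params.xml assumed to contain something like
    //   <LatticeGrid><lattice>8 8 8 8</lattice><mpi>1 1 1 2</mpi></LatticeGrid>
    XmlReader Reader("params.xml");
    GridModuleParameters GridPar(Reader);  // reads the "LatticeGrid" node
    GridPar.check();                       // exits unless lattice and mpi ranks match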

  template <class ReaderClass>
  GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
@@ -75,51 +76,94 @@ private:
// Lower level class
class GridModule {
 public:
  GridCartesian* get_full() { 
  GridCartesian* get_full() {
    std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
    return grid_.get(); }
  GridRedBlackCartesian* get_rb() { 
  GridRedBlackCartesian* get_rb() {
    std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
    return rbgrid_.get(); }

  void set_full(GridCartesian* grid) { grid_.reset(grid); }
  void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
  void show_full_decomposition(){ grid_->show_decomposition(); }
  void show_rb_decomposition(){ rbgrid_->show_decomposition(); }

 protected:
  std::unique_ptr<GridCartesian> grid_;
  std::unique_ptr<GridRedBlackCartesian> rbgrid_;

};

////////////////////////////////////
// Classes for the user
////////////////////////////////////
// Note: the space time grid should be out of the QCD namespace
template< class vector_type>
class GridFourDimModule : public GridModule {
 public:
  GridFourDimModule() {
template <class vector_type>
class GridFourDimModule : public GridModule
{
public:
  GridFourDimModule()
  {
    using namespace QCD;
    set_full(SpaceTimeGrid::makeFourDimGrid(
        GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
        GridDefaultLatt(),
        GridDefaultSimd(4, vector_type::Nsimd()),
        GridDefaultMpi()));
    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
  }

  GridFourDimModule(GridModuleParameters Params) {
  GridFourDimModule(const std::vector<int> tweak_simd)
  {
    using namespace QCD;
    if (tweak_simd.size() != 4)
    {
      std::cout << GridLogError
                << "Error in GridFourDimModule: SIMD decomposition must have 4 entries"
                << std::endl;
      exit(1);
    }

    // Check that the product of the lanes agrees with the expectation
    int simd_product = 1;
    for (auto &n : tweak_simd)
      simd_product *= n;
    std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << "  Product: " << simd_product << std::endl;

    if (simd_product == vector_type::Nsimd())
    {
      set_full(SpaceTimeGrid::makeFourDimGrid(
          GridDefaultLatt(),
          tweak_simd,
          GridDefaultMpi()));
      set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
    }
    else
    {
      std::cout << GridLogError
                << "Error in GridFourDimModule: product of SIMD lanes must equal "
                << vector_type::Nsimd()
                << std::endl;
      exit(1);
    }
  }

  GridFourDimModule(const GridModuleParameters Params)
  {
    using namespace QCD;
    Params.check();
    std::vector<int> lattice_v = Params.getLattice();
    std::vector<int> mpi_v = Params.getMpi();
    if (lattice_v.size() == 4) {
    if (lattice_v.size() == 4)
    {
      set_full(SpaceTimeGrid::makeFourDimGrid(
          lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
          lattice_v,
          GridDefaultSimd(4, vector_type::Nsimd()),
          mpi_v));
      set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
    } else {
      std::cout << GridLogError 
          << "Error in GridFourDimModule: lattice dimension different from 4"
          << std::endl;
    }
    else
    {
      std::cout << GridLogError
                << "Error in GridFourDimModule: lattice dimension different from 4"
                << std::endl;
      exit(1);
    }
  }

@@ -381,7 +381,7 @@ void Grid_init(int *argc,char ***argv)
  std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;

  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
@@ -40,12 +40,6 @@ namespace Grid{
				    double, StoppingCondition,
				    int, MaxCGIterations,
				    bool, ApplySmearing);

    //template <class ReaderClass >
    //FermionParameters(Reader<ReaderClass>& Reader){
    //  read(Reader, "Mobius", *this);
    //}

  };

@@ -113,9 +107,17 @@ int main(int argc, char **argv) {
  bool ApplySmearing = MyParams.Mobius.ApplySmearing;

  // Use this if you want to tweak the default decomposition;
  // commented out as it is very architecture specific
  //std::vector<int> simd_lanes({2,2,1,1});

  // Grid from the command line
  TheHMC.Resources.AddFourDimGrid("gauge");
  // Grid from the command line arguments --grid and --mpi
  // drop the simd_lanes argument to fall back to the default decomposition for the SIMD lanes
  //TheHMC.Resources.AddFourDimGrid("gauge", simd_lanes); // tweak the SIMD lanes
  TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition

  // Possible to create the module by hand,
  // hardcoding parameters or using a Reader
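A sketch of that by-hand route, kept commented out like the tweak above (names hypothetical; GridModuleParameters and GridFourDimModule as in the module code earlier in this commit):

  //GridModuleParameters GridPar(Reader);          // or hardcode the lattice/mpi strings
  //GridFourDimModule<vComplex> GaugeMod(GridPar); // builds the full and red-black grids
  //TheHMC.Resources.AddGrid("gauge", GaugeMod);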