mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Tidy up of mpi3; also some cleaning of the dslash controls.
This commit is contained in:
		@@ -48,8 +48,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=2){
 | 
			
		||||
  int maxlat=32;
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=2){
 | 
			
		||||
    for(int Ls=1;Ls<=16;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
@@ -124,7 +124,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=2){
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=2){
 | 
			
		||||
    for(int Ls=1;Ls<=16;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
@@ -199,7 +199,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=2){
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=2){
 | 
			
		||||
    for(int Ls=1;Ls<=16;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
@@ -271,131 +271,5 @@ int main (int argc, char ** argv)
 | 
			
		||||
      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
  }    
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=2){
 | 
			
		||||
    for(int Ls=1;Ls<=16;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      int ncomm;
 | 
			
		||||
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      std::vector<CartesianCommunicator::CommsRequest_t> empty;
 | 
			
		||||
      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
 | 
			
		||||
      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
 | 
			
		||||
 | 
			
		||||
      for(int mu=0;mu<4;mu++){
 | 
			
		||||
	ncomm=0;
 | 
			
		||||
	if (mpi_layout[mu]>1 ) {
 | 
			
		||||
	  ncomm++;
 | 
			
		||||
 | 
			
		||||
	  int comm_proc;
 | 
			
		||||
	  int xmit_to_rank;
 | 
			
		||||
	  int recv_from_rank;
 | 
			
		||||
 | 
			
		||||
	  comm_proc=1;
 | 
			
		||||
	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	  Grid.SendToRecvFromInit(requests_fwd[mu],
 | 
			
		||||
				  (void *)&xbuf[mu][0],
 | 
			
		||||
				  xmit_to_rank,
 | 
			
		||||
				  (void *)&rbuf[mu][0],
 | 
			
		||||
				  recv_from_rank,
 | 
			
		||||
				  bytes);
 | 
			
		||||
 | 
			
		||||
	  comm_proc = mpi_layout[mu]-1;
 | 
			
		||||
	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	  Grid.SendToRecvFromInit(requests_bwd[mu],
 | 
			
		||||
				  (void *)&xbuf[mu+4][0],
 | 
			
		||||
				  xmit_to_rank,
 | 
			
		||||
				  (void *)&rbuf[mu+4][0],
 | 
			
		||||
				  recv_from_rank,
 | 
			
		||||
				  bytes);
 | 
			
		||||
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      {
 | 
			
		||||
	double start=usecond();
 | 
			
		||||
	for(int i=0;i<Nloop;i++){
 | 
			
		||||
	  
 | 
			
		||||
	  for(int mu=0;mu<4;mu++){
 | 
			
		||||
	    
 | 
			
		||||
	    if (mpi_layout[mu]>1 ) {
 | 
			
		||||
	      
 | 
			
		||||
	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
 | 
			
		||||
	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
 | 
			
		||||
	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
 | 
			
		||||
	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
	  Grid.Barrier();
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	double stop=usecond();
 | 
			
		||||
	
 | 
			
		||||
	double dbytes    = bytes;
 | 
			
		||||
	double xbytes    = Nloop*dbytes*2.0*ncomm;
 | 
			
		||||
	double rbytes    = xbytes;
 | 
			
		||||
	double bidibytes = xbytes+rbytes;
 | 
			
		||||
	
 | 
			
		||||
	double time = stop-start;
 | 
			
		||||
	
 | 
			
		||||
	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      {
 | 
			
		||||
	double start=usecond();
 | 
			
		||||
	for(int i=0;i<Nloop;i++){
 | 
			
		||||
	  
 | 
			
		||||
	  for(int mu=0;mu<4;mu++){
 | 
			
		||||
	    
 | 
			
		||||
	    if (mpi_layout[mu]>1 ) {
 | 
			
		||||
	      
 | 
			
		||||
	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
 | 
			
		||||
	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
 | 
			
		||||
	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
 | 
			
		||||
	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
	  Grid.Barrier();
 | 
			
		||||
	}
 | 
			
		||||
	
 | 
			
		||||
	double stop=usecond();
 | 
			
		||||
	
 | 
			
		||||
	double dbytes    = bytes;
 | 
			
		||||
	double xbytes    = Nloop*dbytes*2.0*ncomm;
 | 
			
		||||
	double rbytes    = xbytes;
 | 
			
		||||
	double bidibytes = xbytes+rbytes;
 | 
			
		||||
	
 | 
			
		||||
	double time = stop-start;
 | 
			
		||||
	
 | 
			
		||||
	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
 | 
			
		||||
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -44,7 +44,6 @@ struct scal {
 | 
			
		||||
    Gamma::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
bool overlapComms = false;
 | 
			
		||||
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 | 
			
		||||
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 | 
			
		||||
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
 | 
			
		||||
@@ -54,10 +53,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
 | 
			
		||||
    overlapComms = true;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
 | 
			
		||||
@@ -126,14 +121,21 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  RealD NP = UGrid->_Nprocessors;
 | 
			
		||||
 | 
			
		||||
  for(int doasm=1;doasm<2;doasm++){
 | 
			
		||||
 | 
			
		||||
    QCD::WilsonKernelsStatic::AsmOpt=doasm;
 | 
			
		||||
 | 
			
		||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
 | 
			
		||||
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 | 
			
		||||
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
 | 
			
		||||
  int ncall =100;
 | 
			
		||||
  if (1) {
 | 
			
		||||
 | 
			
		||||
@@ -162,6 +164,17 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  if (1)
 | 
			
		||||
  {
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
    std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::Dhop "<<std::endl;
 | 
			
		||||
    std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
 | 
			
		||||
    if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 | 
			
		||||
    if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 | 
			
		||||
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 | 
			
		||||
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 | 
			
		||||
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 | 
			
		||||
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
 | 
			
		||||
    typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 | 
			
		||||
    LatticeFermion ssrc(sFGrid);
 | 
			
		||||
    LatticeFermion sref(sFGrid);
 | 
			
		||||
@@ -248,6 +261,16 @@ int main (int argc, char ** argv)
 | 
			
		||||
      sr_e = zero;
 | 
			
		||||
      sr_o = zero;
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
 | 
			
		||||
      if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 | 
			
		||||
      if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 | 
			
		||||
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 | 
			
		||||
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 | 
			
		||||
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 | 
			
		||||
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
 | 
			
		||||
      sDw.ZeroCounters();
 | 
			
		||||
      sDw.stat.init("DhopEO");
 | 
			
		||||
      double t0=usecond();
 | 
			
		||||
@@ -308,7 +331,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
    ref = -0.5*ref;
 | 
			
		||||
  }
 | 
			
		||||
  Dw.Dhop(src,result,1);
 | 
			
		||||
  std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
 | 
			
		||||
  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
@@ -322,13 +345,22 @@ int main (int argc, char ** argv)
 | 
			
		||||
  LatticeFermion r_eo  (FGrid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
 | 
			
		||||
  pickCheckerboard(Even,src_e,src);
 | 
			
		||||
  pickCheckerboard(Odd,src_o,src);
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO                "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
 | 
			
		||||
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 | 
			
		||||
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 | 
			
		||||
  {
 | 
			
		||||
    Dw.ZeroCounters();
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
@@ -366,8 +398,5 @@ int main (int argc, char ** argv)
 | 
			
		||||
  assert(norm2(src_e)<1.0e-5);
 | 
			
		||||
  assert(norm2(src_o)<1.0e-5);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -1,153 +0,0 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./benchmarks/Benchmark_dwf.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  Gamma::GammaMatrix Gmu [] = {
 | 
			
		||||
    Gamma::GammaX,
 | 
			
		||||
    Gamma::GammaY,
 | 
			
		||||
    Gamma::GammaZ,
 | 
			
		||||
    Gamma::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
bool overlapComms = false;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
 | 
			
		||||
    overlapComms = true;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt4 = GridDefaultLatt();
 | 
			
		||||
  const int Ls=16;
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src   (FGrid); random(RNG5,src);
 | 
			
		||||
  LatticeFermion result(FGrid); result=zero;
 | 
			
		||||
  LatticeFermion    ref(FGrid);    ref=zero;
 | 
			
		||||
  LatticeFermion    tmp(FGrid);
 | 
			
		||||
  LatticeFermion    err(FGrid);
 | 
			
		||||
 | 
			
		||||
  ColourMatrix cm = Complex(1.0,0.0);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeField Umu(UGrid); 
 | 
			
		||||
  random(RNG4,Umu);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeField Umu5d(FGrid); 
 | 
			
		||||
 | 
			
		||||
  // replicate across fifth dimension
 | 
			
		||||
  for(int ss=0;ss<Umu._grid->oSites();ss++){
 | 
			
		||||
    for(int s=0;s<Ls;s++){
 | 
			
		||||
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // Naive wilson implementation
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,FGrid);
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if (1)
 | 
			
		||||
  {
 | 
			
		||||
    ref = zero;
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
 | 
			
		||||
      tmp = U[mu]*Cshift(src,mu+1,1);
 | 
			
		||||
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
 | 
			
		||||
 | 
			
		||||
      tmp =adj(U[mu])*src;
 | 
			
		||||
      tmp =Cshift(tmp,mu+1,-1);
 | 
			
		||||
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
 | 
			
		||||
    }
 | 
			
		||||
    ref = -0.5*ref;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5  =1.8;
 | 
			
		||||
 | 
			
		||||
  typename DomainWallFermionR::ImplParams params; 
 | 
			
		||||
  params.overlapCommsCompute = overlapComms;
 | 
			
		||||
  
 | 
			
		||||
  RealD NP = UGrid->_Nprocessors;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  QCD::WilsonKernelsStatic::AsmOpt=1;
 | 
			
		||||
 | 
			
		||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 | 
			
		||||
  
 | 
			
		||||
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
			
		||||
  int ncall =50;
 | 
			
		||||
  if (1) {
 | 
			
		||||
 | 
			
		||||
    double t0=usecond();
 | 
			
		||||
    for(int i=0;i<ncall;i++){
 | 
			
		||||
      Dw.Dhop(src,result,0);
 | 
			
		||||
    }
 | 
			
		||||
    double t1=usecond();
 | 
			
		||||
    
 | 
			
		||||
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
    double flops=1344*volume*ncall;
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
 | 
			
		||||
    err = ref-result; 
 | 
			
		||||
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
    //    Dw.Report();
 | 
			
		||||
  }
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -51,16 +51,18 @@ int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
 | 
			
		||||
  const int Ls=8;
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  if ( getenv("ASMOPT") )  {
 | 
			
		||||
    QCD::WilsonKernelsStatic::AsmOpt=1;
 | 
			
		||||
  } else { 
 | 
			
		||||
    QCD::WilsonKernelsStatic::AsmOpt=0;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -58,6 +58,19 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  RealD mass = 0.1;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
 | 
			
		||||
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 | 
			
		||||
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 | 
			
		||||
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 | 
			
		||||
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
 | 
			
		||||
 
 | 
			
		||||
@@ -1,175 +0,0 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_zmm.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
 | 
			
		||||
 | 
			
		||||
int main(int argc,char **argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
  std::ofstream os("zmm.dat");
 | 
			
		||||
 | 
			
		||||
  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
 | 
			
		||||
  for(int L=4;L<=32;L+=4){
 | 
			
		||||
    for(int m=1;m<=2;m++){
 | 
			
		||||
      for(int Ls=8;Ls<=16;Ls+=8){
 | 
			
		||||
	std::vector<int> grid({L,L,m*L,m*L});
 | 
			
		||||
  std::cout << GridLogMessage <<"\t";
 | 
			
		||||
	for(int i=0;i<4;i++) { 
 | 
			
		||||
	  std::cout << grid[i]<<"x";
 | 
			
		||||
	}
 | 
			
		||||
	std::cout << Ls<<"\t\t";
 | 
			
		||||
	bench(os,grid,Ls);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
			
		||||
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
  std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
 | 
			
		||||
  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src (FGrid);
 | 
			
		||||
  LatticeFermion tmp (FGrid);
 | 
			
		||||
  LatticeFermion srce(FrbGrid);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion resulto(FrbGrid); resulto=zero;
 | 
			
		||||
  LatticeFermion resulta(FrbGrid); resulta=zero;
 | 
			
		||||
  LatticeFermion junk(FrbGrid); junk=zero;
 | 
			
		||||
  LatticeFermion diff(FrbGrid); 
 | 
			
		||||
  LatticeGaugeField Umu(UGrid);
 | 
			
		||||
 | 
			
		||||
  double mfc, mfa, mfo, mfl1;
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
  random(RNG5,src);
 | 
			
		||||
#if 1
 | 
			
		||||
  random(RNG4,Umu);
 | 
			
		||||
#else
 | 
			
		||||
  int mmu=2;
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,UGrid);
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
    if ( mu!=mmu ) U[mu] = zero;
 | 
			
		||||
    if ( mu==mmu ) U[mu] = 1.0;
 | 
			
		||||
    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
 pickCheckerboard(Even,srce,src);
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  RealD M5  =1.8;
 | 
			
		||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
			
		||||
 | 
			
		||||
  int ncall=50;
 | 
			
		||||
  double t0=usecond();
 | 
			
		||||
  for(int i=0;i<ncall;i++){
 | 
			
		||||
    Dw.DhopOE(srce,resulto,0);
 | 
			
		||||
  }
 | 
			
		||||
  double t1=usecond();
 | 
			
		||||
 | 
			
		||||
  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
			
		||||
  double flops=1344*volume/2;
 | 
			
		||||
 | 
			
		||||
  mfc = flops*ncall/(t1-t0);
 | 
			
		||||
  std::cout<<mfc<<"\t\t";
 | 
			
		||||
 | 
			
		||||
  QCD::WilsonKernelsStatic::AsmOpt=1;
 | 
			
		||||
  t0=usecond();
 | 
			
		||||
  for(int i=0;i<ncall;i++){
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
  }
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
  mfa = flops*ncall/(t1-t0);
 | 
			
		||||
  std::cout<<mfa<<"\t\t";
 | 
			
		||||
  /*
 | 
			
		||||
  int dag=DaggerNo;
 | 
			
		||||
  t0=usecond();
 | 
			
		||||
  for(int i=0;i<1;i++){
 | 
			
		||||
    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
 | 
			
		||||
  }
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
  mfo = flops*100/(t1-t0);
 | 
			
		||||
  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
 | 
			
		||||
 | 
			
		||||
  t0=usecond();
 | 
			
		||||
  for(int i=0;i<1;i++){
 | 
			
		||||
    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
 | 
			
		||||
  }
 | 
			
		||||
  t1=usecond();
 | 
			
		||||
  mfl1= flops*100/(t1-t0);
 | 
			
		||||
  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
 | 
			
		||||
  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
 | 
			
		||||
     << mfc<<" "
 | 
			
		||||
     << mfa<<" "
 | 
			
		||||
     << mfo<<" "
 | 
			
		||||
     << mfl1<<std::endl;
 | 
			
		||||
  */
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
    PerformanceCounter Counter(i);
 | 
			
		||||
    Counter.Start();
 | 
			
		||||
    Dw.DhopOE(srce,resulta,0);
 | 
			
		||||
    Counter.Stop();
 | 
			
		||||
    Counter.Report();
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  //resulta = (-0.5) * resulta;
 | 
			
		||||
 | 
			
		||||
  diff = resulto-resulta;
 | 
			
		||||
  std::cout<<norm2(diff)<<std::endl;
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										156
									
								
								lib/Init.cc
									
									
									
									
									
								
							
							
						
						
									
										156
									
								
								lib/Init.cc
									
									
									
									
									
								
							@@ -123,6 +123,13 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void GridCmdOptionInt(std::string &str,int & val)
 | 
			
		||||
{
 | 
			
		||||
  std::stringstream ss(str);
 | 
			
		||||
  ss>>val;
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void GridParseLayout(char **argv,int argc,
 | 
			
		||||
		     std::vector<int> &latt,
 | 
			
		||||
@@ -153,14 +160,12 @@ void GridParseLayout(char **argv,int argc,
 | 
			
		||||
    assert(ompthreads.size()==1);
 | 
			
		||||
    GridThread::SetThreads(ompthreads[0]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
 | 
			
		||||
    std::vector<int> cores(0);
 | 
			
		||||
    int cores;
 | 
			
		||||
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
 | 
			
		||||
    GridCmdOptionIntVector(arg,cores);
 | 
			
		||||
    GridThread::SetCores(cores[0]);
 | 
			
		||||
    GridCmdOptionInt(arg,cores);
 | 
			
		||||
    GridThread::SetCores(cores);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 | 
			
		||||
@@ -169,7 +174,7 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 | 
			
		||||
  return oss.str();
 | 
			
		||||
}
 | 
			
		||||
/////////////////////////////////////////////////////////
 | 
			
		||||
//
 | 
			
		||||
// Reinit guard
 | 
			
		||||
/////////////////////////////////////////////////////////
 | 
			
		||||
static int Grid_is_initialised = 0;
 | 
			
		||||
 | 
			
		||||
@@ -178,27 +183,31 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
{
 | 
			
		||||
  GridLogger::StopWatch.Start();
 | 
			
		||||
 | 
			
		||||
  std::string arg;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // Shared memory block size
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
 | 
			
		||||
    int MB;
 | 
			
		||||
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
 | 
			
		||||
    GridCmdOptionInt(arg,MB);
 | 
			
		||||
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  CartesianCommunicator::Init(argc,argv);
 | 
			
		||||
 | 
			
		||||
  // Parse command line args.
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // Logging
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  std::string arg;
 | 
			
		||||
  std::vector<std::string> logstreams;
 | 
			
		||||
  std::string defaultLog("Error,Warning,Message,Performance");
 | 
			
		||||
 | 
			
		||||
  GridCmdOptionCSL(defaultLog,logstreams);
 | 
			
		||||
  GridLogConfigure(logstreams);
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
 | 
			
		||||
    std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
 | 
			
		||||
    exit(EXIT_SUCCESS);
 | 
			
		||||
  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
 | 
			
		||||
    Grid_quiesce_nodes();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
 | 
			
		||||
@@ -207,38 +216,39 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
    GridLogConfigure(logstreams);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
 | 
			
		||||
    Grid_debug_handler_init();
 | 
			
		||||
  }
 | 
			
		||||
  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
 | 
			
		||||
    Grid_quiesce_nodes();
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
 | 
			
		||||
    QCD::WilsonKernelsStatic::HandOpt=1;
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
 | 
			
		||||
    LebesgueOrder::UseLebesgueOrder=1;
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
 | 
			
		||||
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
 | 
			
		||||
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
 | 
			
		||||
    GridLogTimestamp(1);
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // Help message
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --help : this message"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --timestamp     : tag with millisecond resolution stamps"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"Performance:"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --dslash-asm    : Wilson kernel for AVX512"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --lebesgue      : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<"  --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;    
 | 
			
		||||
    std::cout<<GridLogMessage<<std::endl;
 | 
			
		||||
    exit(EXIT_SUCCESS);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  GridParseLayout(*argv,*argc,
 | 
			
		||||
		  Grid_default_latt,
 | 
			
		||||
		  Grid_default_mpi);
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
 | 
			
		||||
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // Banner
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  std::string COL_RED    = GridLogColours.colour["RED"];
 | 
			
		||||
  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
 | 
			
		||||
@@ -248,7 +258,6 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
 | 
			
		||||
  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
 | 
			
		||||
  
 | 
			
		||||
  
 | 
			
		||||
  std::cout <<std::endl;
 | 
			
		||||
  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
 | 
			
		||||
  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
 | 
			
		||||
@@ -281,6 +290,53 @@ void Grid_init(int *argc,char ***argv)
 | 
			
		||||
  std::cout << COL_BACKGROUND <<std::endl;
 | 
			
		||||
  std::cout << std::endl;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
  // Debug and performance options
 | 
			
		||||
  ////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
 | 
			
		||||
    Grid_debug_handler_init();
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
 | 
			
		||||
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
 | 
			
		||||
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
 | 
			
		||||
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
 | 
			
		||||
    LebesgueOrder::UseLebesgueOrder=1;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
 | 
			
		||||
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
 | 
			
		||||
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
 | 
			
		||||
  }
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
 | 
			
		||||
    GridLogTimestamp(1);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  GridParseLayout(*argv,*argc,
 | 
			
		||||
		  Grid_default_latt,
 | 
			
		||||
		  Grid_default_mpi);
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
 | 
			
		||||
 | 
			
		||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
 | 
			
		||||
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Grid_is_initialised = 1;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -32,6 +32,7 @@ namespace Grid {
 | 
			
		||||
// Info that is setup once and indept of cartesian layout
 | 
			
		||||
///////////////////////////////////////////////////////////////
 | 
			
		||||
void *              CartesianCommunicator::ShmCommBuf;
 | 
			
		||||
uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////
 | 
			
		||||
// Alloc, free shmem region
 | 
			
		||||
@@ -41,8 +42,12 @@ void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
 | 
			
		||||
  void *ptr = (void *)heap_top;
 | 
			
		||||
  heap_top  += bytes;
 | 
			
		||||
  heap_bytes+= bytes;
 | 
			
		||||
  std::cout <<"Shm alloc "<<ptr<<std::endl;
 | 
			
		||||
  assert(heap_bytes < MAX_MPI_SHM_BYTES);
 | 
			
		||||
  if (heap_bytes >= MAX_MPI_SHM_BYTES) {
 | 
			
		||||
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
 | 
			
		||||
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
 | 
			
		||||
    std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
 | 
			
		||||
    assert(heap_bytes<MAX_MPI_SHM_BYTES);
 | 
			
		||||
  }
 | 
			
		||||
  return ptr;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::ShmBufferFreeAll(void) { 
 | 
			
		||||
 
 | 
			
		||||
@@ -55,7 +55,7 @@ class CartesianCommunicator {
 | 
			
		||||
  // Give external control (command line override?) of this
 | 
			
		||||
 | 
			
		||||
  static const int      MAXLOG2RANKSPERNODE = 16;            
 | 
			
		||||
  static const uint64_t MAX_MPI_SHM_BYTES   = 128*1024*1024; 
 | 
			
		||||
  static uint64_t MAX_MPI_SHM_BYTES;
 | 
			
		||||
 | 
			
		||||
  // Communicator should know nothing of the physics grid, only processor grid.
 | 
			
		||||
  int              _Nprocessors;     // How many in all
 | 
			
		||||
 
 | 
			
		||||
@@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include "Grid.h"
 | 
			
		||||
#include <mpi.h>
 | 
			
		||||
#include <semaphore.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
@@ -45,6 +46,7 @@ const int pool = 48;
 | 
			
		||||
 | 
			
		||||
class SlaveState {
 | 
			
		||||
public:
 | 
			
		||||
  sem_t    sem;
 | 
			
		||||
  volatile int head;
 | 
			
		||||
  volatile int start;
 | 
			
		||||
  volatile int tail;
 | 
			
		||||
@@ -56,29 +58,32 @@ public:
 | 
			
		||||
  SlaveState *state;
 | 
			
		||||
  MPI_Comm squadron;
 | 
			
		||||
  uint64_t     base;
 | 
			
		||||
  int universe_rank;
 | 
			
		||||
  int vertical_rank;
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
  // Descriptor circular pointers
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
  Slave() {};
 | 
			
		||||
 | 
			
		||||
  void Init(SlaveState * _state,MPI_Comm _squadron);
 | 
			
		||||
  void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
 | 
			
		||||
  
 | 
			
		||||
  void EventLoop (void) {
 | 
			
		||||
    std::cerr<< " Entering even loop "<<std::endl;
 | 
			
		||||
    while(1) {
 | 
			
		||||
    std::cout<< " Entering event loop "<<std::endl;
 | 
			
		||||
    while(1){
 | 
			
		||||
      Event();
 | 
			
		||||
      MPI_Barrier(squadron);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  void Event (void) ;
 | 
			
		||||
  int Event (void) ;
 | 
			
		||||
 | 
			
		||||
  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int rank) ;
 | 
			
		||||
  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
 | 
			
		||||
 | 
			
		||||
  void WaitAll() {
 | 
			
		||||
  void QueueWaitAll() {
 | 
			
		||||
    QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
 | 
			
		||||
    std::cerr<< " Waiting on FIFO drain "<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  void WaitAll() {
 | 
			
		||||
    while ( state->tail != state->head );
 | 
			
		||||
    std::cerr<< " FIFO drained "<< state->tail <<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
@@ -119,26 +124,31 @@ public:
 | 
			
		||||
				MPI_Comm &ShmComm,
 | 
			
		||||
				void * &ShmCommBuf);
 | 
			
		||||
 | 
			
		||||
  static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank);
 | 
			
		||||
  static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
  // routines for master proc must handle any communicator
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  static uint64_t QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
 | 
			
		||||
    std::cerr<< " Queueing send  "<< bytes<<std::endl;
 | 
			
		||||
     //    std::cout<< " Queueing send  "<< bytes<< " slave "<< slave << " to comm "<<rank  <<std::endl;
 | 
			
		||||
    return Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static uint64_t QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
 | 
			
		||||
    std::cerr<< " Queueing receive  "<< bytes<<std::endl;
 | 
			
		||||
    //    std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank  <<std::endl;
 | 
			
		||||
    return Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void WaitAll() {
 | 
			
		||||
    for(int s=1;s<VerticalSize;s++) {
 | 
			
		||||
      Slaves[s].QueueWaitAll();
 | 
			
		||||
    }
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
    for(int s=1;s<VerticalSize;s++) {
 | 
			
		||||
      Slaves[s].WaitAll();
 | 
			
		||||
    }
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
 | 
			
		||||
@@ -163,6 +173,7 @@ public:
 | 
			
		||||
      GetWork(bytes,s,mywork,myoff,procs);
 | 
			
		||||
      QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
 | 
			
		||||
    }
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
 | 
			
		||||
@@ -173,6 +184,7 @@ public:
 | 
			
		||||
      GetWork(bytes,s,mywork,myoff,procs);
 | 
			
		||||
      QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
 | 
			
		||||
    }
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
@@ -225,7 +237,7 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
 | 
			
		||||
  // Split into groups that can share memory (Verticals)
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////
 | 
			
		||||
  //  MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
 | 
			
		||||
  MPI_Comm_split(communicator_universe,(UniverseRank&0x1),UniverseRank,&VerticalComm);
 | 
			
		||||
  MPI_Comm_split(communicator_universe,(UniverseRank/2),UniverseRank,&VerticalComm);
 | 
			
		||||
  MPI_Comm_rank(VerticalComm     ,&VerticalRank);
 | 
			
		||||
  MPI_Comm_size(VerticalComm     ,&VerticalSize);
 | 
			
		||||
  
 | 
			
		||||
@@ -262,14 +274,14 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
 | 
			
		||||
  ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
  
 | 
			
		||||
  std::cerr<<"SHM "<<ShmCommBuf<<std::endl;
 | 
			
		||||
  std::cout<<"SHM "<<ShmCommBuf<<std::endl;
 | 
			
		||||
 | 
			
		||||
  VerticalShmBufs.resize(VerticalSize);
 | 
			
		||||
  for(int r=0;r<VerticalSize;r++){
 | 
			
		||||
    MPI_Aint sz;
 | 
			
		||||
    int dsp_unit;
 | 
			
		||||
    MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
 | 
			
		||||
    std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
 | 
			
		||||
    std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -286,14 +298,16 @@ void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  if ( VerticalRank != 0 ) {
 | 
			
		||||
    Slave indentured;
 | 
			
		||||
    indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm);
 | 
			
		||||
    indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
    indentured.EventLoop();
 | 
			
		||||
    assert(0);
 | 
			
		||||
  } else {
 | 
			
		||||
    Slaves.resize(VerticalSize);
 | 
			
		||||
    for(int i=1;i<VerticalSize;i++){
 | 
			
		||||
      Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm);
 | 
			
		||||
      Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
 | 
			
		||||
    }
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
@@ -337,8 +351,10 @@ void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_pee
 | 
			
		||||
 | 
			
		||||
  if ( comm == HorizontalComm ) {
 | 
			
		||||
    comm_world_peer = rank;
 | 
			
		||||
    //    std::cout << " MapCommRankToWorldRank  horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
 | 
			
		||||
  } else if ( comm == communicator_cached ) {
 | 
			
		||||
    comm_world_peer = UserCommunicatorToWorldRanks[rank];
 | 
			
		||||
    //    std::cout << " MapCommRankToWorldRank  cached " <<rank<<"->"<<comm_world_peer<<std::endl;
 | 
			
		||||
  } else { 
 | 
			
		||||
    
 | 
			
		||||
    int size;
 | 
			
		||||
@@ -360,6 +376,7 @@ void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_pee
 | 
			
		||||
    MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]); 
 | 
			
		||||
    
 | 
			
		||||
    comm_world_peer = UserCommunicatorToWorldRanks[rank];
 | 
			
		||||
    //    std::cout << " MapCommRankToWorldRank  cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    assert(comm_world_peer != MPI_UNDEFINED);
 | 
			
		||||
  }
 | 
			
		||||
@@ -370,55 +387,30 @@ void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_pee
 | 
			
		||||
  int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
 | 
			
		||||
                ^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
 | 
			
		||||
  
 | 
			
		||||
  hashtag = (comm_hash<<15) | tag;      
 | 
			
		||||
  //  hashtag = (comm_hash<<15) | tag;      
 | 
			
		||||
  hashtag = tag;      
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void Slave::Init(SlaveState * _state,MPI_Comm _squadron)
 | 
			
		||||
void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
 | 
			
		||||
{
 | 
			
		||||
  squadron=_squadron;
 | 
			
		||||
  universe_rank=_universe_rank;
 | 
			
		||||
  vertical_rank=_vertical_rank;
 | 
			
		||||
  state   =_state;
 | 
			
		||||
  std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
 | 
			
		||||
  state->head = state->tail = state->start = 0;
 | 
			
		||||
  MPI_Barrier(squadron);
 | 
			
		||||
  base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
 | 
			
		||||
  int rank; MPI_Comm_rank(_squadron,&rank);
 | 
			
		||||
}
 | 
			
		||||
#define PERI_PLUS(A) ( (A+1)%pool )
 | 
			
		||||
void Slave::Event (void) {
 | 
			
		||||
int Slave::Event (void) {
 | 
			
		||||
 | 
			
		||||
  static int tail_last;
 | 
			
		||||
  static int head_last;
 | 
			
		||||
  static int start_last;
 | 
			
		||||
  int ierr;
 | 
			
		||||
 | 
			
		||||
  if (   (state->tail != tail_last)
 | 
			
		||||
       ||(state->head != head_last)
 | 
			
		||||
       ||(state->start != start_last)
 | 
			
		||||
       ) { 
 | 
			
		||||
    std::cerr<< " Event loop "<< state->tail << " "<< state->start<< " "<< state->head <<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  // Try to advance the tail pointers
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  /*
 | 
			
		||||
  int t=state->tail;
 | 
			
		||||
  if ( t != state->start ) {
 | 
			
		||||
    int flag=0;
 | 
			
		||||
    
 | 
			
		||||
    std::cerr<< " Testing tail "<< t<<" "<< (void *)&state->Descrs[t].request
 | 
			
		||||
	     << " "<<state->Descrs[t].request<<std::endl;
 | 
			
		||||
    //    ierr=MPI_Test((MPI_Request *)&state->Descrs[t].request,&flag,MPI_STATUS_IGNORE);
 | 
			
		||||
    //    ierr=MPI_Test((MPI_Request *)&state->Descrs[t].request,&flag,MPI_STATUS_IGNORE);
 | 
			
		||||
    assert(ierr==0);
 | 
			
		||||
    if ( flag ) {
 | 
			
		||||
      state->tail = PERI_PLUS(t);
 | 
			
		||||
      std::cerr<< " Tail advanced from "<< t<<std::endl;
 | 
			
		||||
      return;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  */
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  // Try to advance the start pointers
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
@@ -426,11 +418,11 @@ void Slave::Event (void) {
 | 
			
		||||
  if ( s != state->head ) {
 | 
			
		||||
    switch ( state->Descrs[s].command ) {
 | 
			
		||||
    case COMMAND_ISEND:
 | 
			
		||||
      std::cerr<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
 | 
			
		||||
      /*
 | 
			
		||||
            std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
 | 
			
		||||
      	       << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
 | 
			
		||||
	       << " Comm " << MPIoffloadEngine::communicator_universe<< std::endl;
 | 
			
		||||
 | 
			
		||||
      std::cerr<< " Request was "<<state->Descrs[s].request<<std::endl;
 | 
			
		||||
       << " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl;
 | 
			
		||||
      */
 | 
			
		||||
      ierr = MPI_Isend((void *)(state->Descrs[s].buf+base), 
 | 
			
		||||
		       state->Descrs[s].bytes, 
 | 
			
		||||
		       MPI_CHAR,
 | 
			
		||||
@@ -438,18 +430,17 @@ void Slave::Event (void) {
 | 
			
		||||
		       state->Descrs[s].tag,
 | 
			
		||||
		       MPIoffloadEngine::communicator_universe,
 | 
			
		||||
		       (MPI_Request *)&state->Descrs[s].request);
 | 
			
		||||
      std::cerr<< " Request is "<<state->Descrs[s].request<<std::endl;
 | 
			
		||||
      std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
 | 
			
		||||
      assert(ierr==0);
 | 
			
		||||
      state->start = PERI_PLUS(s);
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    case COMMAND_IRECV:
 | 
			
		||||
      std::cerr<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
 | 
			
		||||
	       << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
 | 
			
		||||
	       << " Comm " << MPIoffloadEngine::communicator_universe<< std::endl;
 | 
			
		||||
 | 
			
		||||
      std::cerr<< " Request was "<<state->Descrs[s].request<<std::endl;
 | 
			
		||||
      /*
 | 
			
		||||
      std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
 | 
			
		||||
	       << " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
 | 
			
		||||
	       << " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl;
 | 
			
		||||
      */
 | 
			
		||||
      ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base), 
 | 
			
		||||
		     state->Descrs[s].bytes, 
 | 
			
		||||
		     MPI_CHAR,
 | 
			
		||||
@@ -457,30 +448,32 @@ void Slave::Event (void) {
 | 
			
		||||
		     state->Descrs[s].tag,
 | 
			
		||||
		     MPIoffloadEngine::communicator_universe,
 | 
			
		||||
		     (MPI_Request *)&state->Descrs[s].request);
 | 
			
		||||
      std::cerr<< " Request is "<<state->Descrs[s].request<<std::endl;
 | 
			
		||||
      std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
 | 
			
		||||
 | 
			
		||||
      //      std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
 | 
			
		||||
      //      std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
 | 
			
		||||
      assert(ierr==0);
 | 
			
		||||
      state->start = PERI_PLUS(s);
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    case COMMAND_WAITALL:
 | 
			
		||||
      std::cerr<< " Wait all "<<std::endl;
 | 
			
		||||
 | 
			
		||||
      for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
 | 
			
		||||
	std::cerr<< " Wait ["<<t<<"] "<<state->Descrs[t].request <<std::endl;
 | 
			
		||||
	std::cerr<< " Request0 is "<<state->Descrs[0].request<<std::endl;
 | 
			
		||||
	MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
 | 
			
		||||
      };
 | 
			
		||||
      s=PERI_PLUS(s);
 | 
			
		||||
      state->start = s;
 | 
			
		||||
      state->tail  = s;
 | 
			
		||||
      MPI_Barrier(squadron);
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    default:
 | 
			
		||||
      assert(0);
 | 
			
		||||
      break;
 | 
			
		||||
    }
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // External interaction with the queue
 | 
			
		||||
@@ -500,17 +493,29 @@ uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm
 | 
			
		||||
  MPI_Comm    communicator;
 | 
			
		||||
  MPI_Request request;
 | 
			
		||||
  
 | 
			
		||||
  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,commrank,tag,comm,worldrank);
 | 
			
		||||
  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
 | 
			
		||||
 | 
			
		||||
  int VerticalRank = MPIoffloadEngine::VerticalRank;
 | 
			
		||||
  uint64_t relative= (uint64_t)buf - base;
 | 
			
		||||
  state->Descrs[head].buf    = relative;
 | 
			
		||||
  state->Descrs[head].bytes  = bytes;
 | 
			
		||||
  state->Descrs[head].rank   = MPIoffloadEngine::UniverseRanks[worldrank][VerticalRank];
 | 
			
		||||
  state->Descrs[head].rank   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
 | 
			
		||||
  state->Descrs[head].tag    = hashtag;
 | 
			
		||||
  state->Descrs[head].command= command;
 | 
			
		||||
  std::cerr<< " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
 | 
			
		||||
 | 
			
		||||
  /*  
 | 
			
		||||
  if ( command == COMMAND_ISEND ) { 
 | 
			
		||||
  std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank 
 | 
			
		||||
            << " to worldrank " << worldrank <<std::endl;
 | 
			
		||||
  std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
 | 
			
		||||
  std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
 | 
			
		||||
  } 
 | 
			
		||||
  if ( command == COMMAND_IRECV ) { 
 | 
			
		||||
  std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank 
 | 
			
		||||
            << " from worldrank " << worldrank <<std::endl;
 | 
			
		||||
  std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
 | 
			
		||||
  std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
 | 
			
		||||
  } 
 | 
			
		||||
  */
 | 
			
		||||
  // Block until FIFO has space
 | 
			
		||||
  while( state->tail==next );
 | 
			
		||||
 | 
			
		||||
@@ -671,6 +676,8 @@ void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_
 | 
			
		||||
  // assert xmit and recv lie in shared memory region
 | 
			
		||||
  assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
 | 
			
		||||
  assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
 | 
			
		||||
  assert(from!=_processor);
 | 
			
		||||
  assert(dest!=_processor);
 | 
			
		||||
  MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
 | 
			
		||||
  MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -32,8 +32,7 @@ directory
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
int WilsonKernelsStatic::HandOpt;
 | 
			
		||||
int WilsonKernelsStatic::AsmOpt;
 | 
			
		||||
int WilsonKernelsStatic::Opt;
 | 
			
		||||
 | 
			
		||||
template <class Impl>
 | 
			
		||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
 | 
			
		||||
 
 | 
			
		||||
@@ -40,9 +40,9 @@ namespace QCD {
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
class WilsonKernelsStatic { 
 | 
			
		||||
 public:
 | 
			
		||||
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
 | 
			
		||||
  // S-direction is INNERMOST and takes no part in the parity.
 | 
			
		||||
  static int AsmOpt;  // these are a temporary hack
 | 
			
		||||
  static int HandOpt; // these are a temporary hack
 | 
			
		||||
  static int Opt;  // these are a temporary hack
 | 
			
		||||
};
 | 
			
		||||
 
 | 
			
		||||
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
 | 
			
		||||
@@ -56,24 +56,40 @@ public:
 | 
			
		||||
  template <bool EnableBool = true>
 | 
			
		||||
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
 | 
			
		||||
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
    if (AsmOpt) {
 | 
			
		||||
      WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 | 
			
		||||
    } else {
 | 
			
		||||
#else
 | 
			
		||||
		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) 
 | 
			
		||||
  {
 | 
			
		||||
#endif
 | 
			
		||||
    switch(Opt) {
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
    case OptInlineAsm:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
      }
 | 
			
		||||
      break;
 | 
			
		||||
#endif
 | 
			
		||||
    case OptHandUnroll:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  if (HandOpt)
 | 
			
		||||
	  WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  else
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
      }
 | 
			
		||||
      break;
 | 
			
		||||
    case OptGeneric:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
      }
 | 
			
		||||
      break;
 | 
			
		||||
    default:
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
     
 | 
			
		||||
@@ -81,7 +97,7 @@ public:
 | 
			
		||||
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
 | 
			
		||||
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
 | 
			
		||||
     
 | 
			
		||||
    // no kernel choice  
 | 
			
		||||
    for (int site = 0; site < Ns; site++) {
 | 
			
		||||
      for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
 | 
			
		||||
@@ -95,23 +111,39 @@ public:
 | 
			
		||||
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
 | 
			
		||||
  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
 | 
			
		||||
		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
 | 
			
		||||
 | 
			
		||||
    switch(Opt) {
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
    if (AsmOpt) {
 | 
			
		||||
      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 | 
			
		||||
    } else {
 | 
			
		||||
#else
 | 
			
		||||
    {
 | 
			
		||||
#endif
 | 
			
		||||
    case OptInlineAsm:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
      }
 | 
			
		||||
      break;
 | 
			
		||||
#endif
 | 
			
		||||
    case OptHandUnroll:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  if (HandOpt)
 | 
			
		||||
	  WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  else
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
      }
 | 
			
		||||
      break;
 | 
			
		||||
    case OptGeneric:
 | 
			
		||||
      for (int site = 0; site < Ns; site++) {
 | 
			
		||||
	for (int s = 0; s < Ls; s++) {
 | 
			
		||||
	  WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
			
		||||
	  sF++;
 | 
			
		||||
	}
 | 
			
		||||
	sU++;
 | 
			
		||||
      }
 | 
			
		||||
      break;
 | 
			
		||||
    default:
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -32,7 +32,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
int LebesgueOrder::UseLebesgueOrder;
 | 
			
		||||
std::vector<int> LebesgueOrder::Block({2,2,2,2});
 | 
			
		||||
std::vector<int> LebesgueOrder::Block({8,2,2,2});
 | 
			
		||||
 | 
			
		||||
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
 | 
			
		||||
  n--;           // 1000 0011 --> 1000 0010
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user