mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Merge branch 'develop' into temporary-smearing
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -62,6 +62,7 @@ stamp-h1
 | 
				
			|||||||
config.sub
 | 
					config.sub
 | 
				
			||||||
config.guess
 | 
					config.guess
 | 
				
			||||||
INSTALL
 | 
					INSTALL
 | 
				
			||||||
 | 
					.dirstamp
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Packages #
 | 
					# Packages #
 | 
				
			||||||
############
 | 
					############
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -88,3 +88,7 @@ script:
 | 
				
			|||||||
    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
 | 
					    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
 | 
				
			||||||
    - make -j4
 | 
					    - make -j4
 | 
				
			||||||
    - ./benchmarks/Benchmark_dwf --threads 1
 | 
					    - ./benchmarks/Benchmark_dwf --threads 1
 | 
				
			||||||
 | 
					    - make clean
 | 
				
			||||||
 | 
					    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none
 | 
				
			||||||
 | 
					    - make -j4
 | 
				
			||||||
 | 
					    - ./benchmarks/Benchmark_dwf --threads 1
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										4
									
								
								VERSION
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								VERSION
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,4 @@
 | 
				
			|||||||
 | 
					Version : 0.5.0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- AVX512, AVX2, AVX, SSE good
 | 
				
			||||||
 | 
					- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
 | 
				
			||||||
@@ -165,11 +165,11 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  if (1)
 | 
					  if (1)
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
 | 
					    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
 | 
				
			||||||
    LatticeFermionF ssrc(sFGrid);
 | 
					    LatticeFermion ssrc(sFGrid);
 | 
				
			||||||
    LatticeFermionF sref(sFGrid);
 | 
					    LatticeFermion sref(sFGrid);
 | 
				
			||||||
    LatticeFermionF sresult(sFGrid);
 | 
					    LatticeFermion sresult(sFGrid);
 | 
				
			||||||
    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
 | 
					    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
    for(int x=0;x<latt4[0];x++){
 | 
					    for(int x=0;x<latt4[0];x++){
 | 
				
			||||||
    for(int y=0;y<latt4[1];y++){
 | 
					    for(int y=0;y<latt4[1];y++){
 | 
				
			||||||
@@ -177,7 +177,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    for(int t=0;t<latt4[3];t++){
 | 
					    for(int t=0;t<latt4[3];t++){
 | 
				
			||||||
    for(int s=0;s<Ls;s++){
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
      std::vector<int> site({s,x,y,z,t});
 | 
					      std::vector<int> site({s,x,y,z,t});
 | 
				
			||||||
      SpinColourVectorF tmp;
 | 
					      SpinColourVector tmp;
 | 
				
			||||||
      peekSite(tmp,src,site);
 | 
					      peekSite(tmp,src,site);
 | 
				
			||||||
      pokeSite(tmp,ssrc,site);
 | 
					      pokeSite(tmp,ssrc,site);
 | 
				
			||||||
    }}}}}
 | 
					    }}}}}
 | 
				
			||||||
@@ -217,7 +217,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
    for(int t=0;t<latt4[3];t++){
 | 
					    for(int t=0;t<latt4[3];t++){
 | 
				
			||||||
    for(int s=0;s<Ls;s++){
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
      std::vector<int> site({s,x,y,z,t});
 | 
					      std::vector<int> site({s,x,y,z,t});
 | 
				
			||||||
      SpinColourVectorF normal, simd;
 | 
					      SpinColourVector normal, simd;
 | 
				
			||||||
      peekSite(normal,result,site);
 | 
					      peekSite(normal,result,site);
 | 
				
			||||||
      peekSite(simd,sresult,site);
 | 
					      peekSite(simd,sresult,site);
 | 
				
			||||||
      sum=sum+norm2(normal-simd);
 | 
					      sum=sum+norm2(normal-simd);
 | 
				
			||||||
@@ -230,8 +230,8 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    if (1) {
 | 
					    if (1) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      LatticeFermionF sr_eo(sFGrid);
 | 
					      LatticeFermion sr_eo(sFGrid);
 | 
				
			||||||
      LatticeFermionF serr(sFGrid);
 | 
					      LatticeFermion serr(sFGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      LatticeFermion ssrc_e (sFrbGrid);
 | 
					      LatticeFermion ssrc_e (sFrbGrid);
 | 
				
			||||||
      LatticeFermion ssrc_o (sFrbGrid);
 | 
					      LatticeFermion ssrc_o (sFrbGrid);
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										369
									
								
								benchmarks/Benchmark_dwf_sweep.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										369
									
								
								benchmarks/Benchmark_dwf_sweep.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,369 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./benchmarks/Benchmark_dwf.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#include <PerfCount.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace std;
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class d>
 | 
				
			||||||
 | 
					struct scal {
 | 
				
			||||||
 | 
					  d internal;
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Gamma::GammaMatrix Gmu [] = {
 | 
				
			||||||
 | 
					    Gamma::GammaX,
 | 
				
			||||||
 | 
					    Gamma::GammaY,
 | 
				
			||||||
 | 
					    Gamma::GammaZ,
 | 
				
			||||||
 | 
					    Gamma::GammaT
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
 | 
				
			||||||
 | 
					void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  const int Ls=16;
 | 
				
			||||||
 | 
					  int threads = GridThread::GetThreads();
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( getenv("ASMOPT") )  {
 | 
				
			||||||
 | 
					    QCD::WilsonKernelsStatic::AsmOpt=1;
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    QCD::WilsonKernelsStatic::AsmOpt=0;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s)  "<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int Lmax=32;
 | 
				
			||||||
 | 
					  int dmin=0;
 | 
				
			||||||
 | 
					  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
 | 
				
			||||||
 | 
					  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
 | 
				
			||||||
 | 
					  for (int L=8;L<=Lmax;L*=2){
 | 
				
			||||||
 | 
					    std::vector<int> latt4(4,L);
 | 
				
			||||||
 | 
					    for(int d=4;d>dmin;d--){
 | 
				
			||||||
 | 
					      if ( d<=3 ) latt4[d]*=2;
 | 
				
			||||||
 | 
					      std::cout << GridLogMessage <<"\t";
 | 
				
			||||||
 | 
					      for(int d=0;d<Nd;d++){
 | 
				
			||||||
 | 
						std::cout<<latt4[d]<<"x";
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      std::cout <<Ls<<"\t" ;
 | 
				
			||||||
 | 
					      benchDw (latt4,Ls,threads,0);
 | 
				
			||||||
 | 
					      benchsDw(latt4,Ls,threads,0);
 | 
				
			||||||
 | 
					      std::cout<<std::endl;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    std::vector<int> latt4(4,16);
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
 | 
				
			||||||
 | 
					    benchDw (latt4,Ls,threads,1);
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
 | 
				
			||||||
 | 
					    benchsDw(latt4,Ls,threads,1);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Grid_finalize();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef CHECK
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> seeds4({1,2,3,4});
 | 
				
			||||||
 | 
					  std::vector<int> seeds5({5,6,7,8});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CHECK 
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
				
			||||||
 | 
					  LatticeFermion src   (FGrid); random(RNG5,src);
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid); 
 | 
				
			||||||
 | 
					  random(RNG4,Umu);
 | 
				
			||||||
 | 
					#else 
 | 
				
			||||||
 | 
					  LatticeFermion src   (FGrid); src=zero;
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid); Umu=zero;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeFermion result(FGrid); result=zero;
 | 
				
			||||||
 | 
					  LatticeFermion    ref(FGrid);    ref=zero;
 | 
				
			||||||
 | 
					  LatticeFermion    tmp(FGrid);
 | 
				
			||||||
 | 
					  LatticeFermion    err(FGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ColourMatrix cm = Complex(1.0,0.0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu5d(FGrid); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // replicate across fifth dimension
 | 
				
			||||||
 | 
					  for(int ss=0;ss<Umu._grid->oSites();ss++){
 | 
				
			||||||
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
 | 
					      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////////
 | 
				
			||||||
 | 
					  // Naive wilson implementation
 | 
				
			||||||
 | 
					  ////////////////////////////////////
 | 
				
			||||||
 | 
					  std::vector<LatticeColourMatrix> U(4,FGrid);
 | 
				
			||||||
 | 
					  for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CHECK
 | 
				
			||||||
 | 
					  if (1)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    ref = zero;
 | 
				
			||||||
 | 
					    for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      tmp = U[mu]*Cshift(src,mu+1,1);
 | 
				
			||||||
 | 
					      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      tmp =adj(U[mu])*src;
 | 
				
			||||||
 | 
					      tmp =Cshift(tmp,mu+1,-1);
 | 
				
			||||||
 | 
					      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    ref = -0.5*ref;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD mass=0.1;
 | 
				
			||||||
 | 
					  RealD M5  =1.8;
 | 
				
			||||||
 | 
					  RealD NP = UGrid->_Nprocessors;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  double t0=usecond();
 | 
				
			||||||
 | 
					  Dw.Dhop(src,result,0);
 | 
				
			||||||
 | 
					  double t1=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef TIMERS_OFF
 | 
				
			||||||
 | 
					    int ncall =10;
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if (ncall < 5 ) exit(0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Dw.Dhop(src,result,0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  PerformanceCounter Counter(8);
 | 
				
			||||||
 | 
					  Counter.Start();
 | 
				
			||||||
 | 
					  t0=usecond();
 | 
				
			||||||
 | 
					  for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					    Dw.Dhop(src,result,0);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  t1=usecond();
 | 
				
			||||||
 | 
					  Counter.Stop();
 | 
				
			||||||
 | 
					  if ( report ) {
 | 
				
			||||||
 | 
					    Counter.Report();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( ! report ) 
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					      double flops=1344*volume*ncall;
 | 
				
			||||||
 | 
					      std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					#ifdef CHECK
 | 
				
			||||||
 | 
					    err = ref-result; 
 | 
				
			||||||
 | 
					    RealD errd = norm2(err);
 | 
				
			||||||
 | 
					    if ( errd> 1.0e-4 ) {
 | 
				
			||||||
 | 
					      std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
 | 
				
			||||||
 | 
					      exit(-1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					  LatticeFermion src_e (FrbGrid);
 | 
				
			||||||
 | 
					  LatticeFermion src_o (FrbGrid);
 | 
				
			||||||
 | 
					  LatticeFermion r_e   (FrbGrid);
 | 
				
			||||||
 | 
					  LatticeFermion r_o   (FrbGrid);
 | 
				
			||||||
 | 
					  LatticeFermion r_eo  (FGrid);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  pickCheckerboard(Even,src_e,src);
 | 
				
			||||||
 | 
					  pickCheckerboard(Odd,src_o,src);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
				
			||||||
 | 
					    double t0=usecond();
 | 
				
			||||||
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    double t1=usecond();
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    if(!report){
 | 
				
			||||||
 | 
					      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					      double flops=(1344.0*volume*ncall)/2;
 | 
				
			||||||
 | 
					      std::cout<< flops/(t1-t0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef CHECK_SDW
 | 
				
			||||||
 | 
					void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> seeds4({1,2,3,4});
 | 
				
			||||||
 | 
					  std::vector<int> seeds5({5,6,7,8});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef CHECK_SDW
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
				
			||||||
 | 
					  LatticeFermion src   (FGrid); random(RNG5,src);
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid); 
 | 
				
			||||||
 | 
					  random(RNG4,Umu);
 | 
				
			||||||
 | 
					#else 
 | 
				
			||||||
 | 
					  LatticeFermion src   (FGrid); src=zero;
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid); Umu=zero;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeFermion result(FGrid); result=zero;
 | 
				
			||||||
 | 
					  LatticeFermion    ref(FGrid);    ref=zero;
 | 
				
			||||||
 | 
					  LatticeFermion    tmp(FGrid);
 | 
				
			||||||
 | 
					  LatticeFermion    err(FGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ColourMatrix cm = Complex(1.0,0.0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu5d(FGrid); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // replicate across fifth dimension
 | 
				
			||||||
 | 
					  for(int ss=0;ss<Umu._grid->oSites();ss++){
 | 
				
			||||||
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
 | 
					      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD mass=0.1;
 | 
				
			||||||
 | 
					  RealD M5  =1.8;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
 | 
				
			||||||
 | 
					    LatticeFermion ssrc(sFGrid);
 | 
				
			||||||
 | 
					    LatticeFermion sref(sFGrid);
 | 
				
			||||||
 | 
					    LatticeFermion sresult(sFGrid);
 | 
				
			||||||
 | 
					    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					    for(int x=0;x<latt4[0];x++){
 | 
				
			||||||
 | 
					    for(int y=0;y<latt4[1];y++){
 | 
				
			||||||
 | 
					    for(int z=0;z<latt4[2];z++){
 | 
				
			||||||
 | 
					    for(int t=0;t<latt4[3];t++){
 | 
				
			||||||
 | 
					    for(int s=0;s<Ls;s++){
 | 
				
			||||||
 | 
					      std::vector<int> site({s,x,y,z,t});
 | 
				
			||||||
 | 
					      SpinColourVector tmp;
 | 
				
			||||||
 | 
					      peekSite(tmp,src,site);
 | 
				
			||||||
 | 
					      pokeSite(tmp,ssrc,site);
 | 
				
			||||||
 | 
					    }}}}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    double t0=usecond();
 | 
				
			||||||
 | 
					    sDw.Dhop(ssrc,sresult,0);
 | 
				
			||||||
 | 
					    double t1=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef TIMERS_OFF
 | 
				
			||||||
 | 
					    int ncall =10;
 | 
				
			||||||
 | 
					#else 
 | 
				
			||||||
 | 
					    int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    PerformanceCounter Counter(8);
 | 
				
			||||||
 | 
					    Counter.Start();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      sDw.Dhop(ssrc,sresult,0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    Counter.Stop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if ( report ) {
 | 
				
			||||||
 | 
					      Counter.Report();
 | 
				
			||||||
 | 
					    } else { 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					      double flops=1344*volume*ncall;
 | 
				
			||||||
 | 
					      std::cout<<"\t"<< flops/(t1-t0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    LatticeFermion sr_eo(sFGrid);
 | 
				
			||||||
 | 
					    LatticeFermion serr(sFGrid);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    LatticeFermion ssrc_e (sFrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion ssrc_o (sFrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion sr_e   (sFrbGrid);
 | 
				
			||||||
 | 
					    LatticeFermion sr_o   (sFrbGrid);
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					    pickCheckerboard(Even,ssrc_e,ssrc);
 | 
				
			||||||
 | 
					    pickCheckerboard(Odd,ssrc_o,ssrc);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    setCheckerboard(sr_eo,ssrc_o);
 | 
				
			||||||
 | 
					    setCheckerboard(sr_eo,ssrc_e);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    sr_e = zero;
 | 
				
			||||||
 | 
					    sr_o = zero;
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
 | 
				
			||||||
 | 
					    PerformanceCounter CounterSdw(8);
 | 
				
			||||||
 | 
					    CounterSdw.Start();
 | 
				
			||||||
 | 
					    t0=usecond();
 | 
				
			||||||
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      __SSC_START;
 | 
				
			||||||
 | 
					      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
 | 
				
			||||||
 | 
					      __SSC_STOP;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    t1=usecond();
 | 
				
			||||||
 | 
					    CounterSdw.Stop();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if ( report ) { 
 | 
				
			||||||
 | 
					      CounterSdw.Report();
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					      double flops=(1344.0*volume*ncall)/2;
 | 
				
			||||||
 | 
					      std::cout<<"\t"<< flops/(t1-t0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -1,5 +1,5 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 | 
					bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Benchmark_comms_SOURCES=Benchmark_comms.cc
 | 
					Benchmark_comms_SOURCES=Benchmark_comms.cc
 | 
				
			||||||
@@ -14,6 +14,10 @@ Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
 | 
				
			|||||||
Benchmark_dwf_ntpf_LDADD=-lGrid
 | 
					Benchmark_dwf_ntpf_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
 | 
				
			||||||
 | 
					Benchmark_dwf_sweep_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 | 
					Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 | 
				
			||||||
Benchmark_memory_asynch_LDADD=-lGrid
 | 
					Benchmark_memory_asynch_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										0
									
								
								lib/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/.dirstamp
									
									
									
									
									
										Normal file
									
								
							@@ -262,7 +262,6 @@ void Grid_init(int *argc,char ***argv)
 | 
				
			|||||||
  std::cout << std::endl;
 | 
					  std::cout << std::endl;
 | 
				
			||||||
  std::cout <<COL_YELLOW<< std::endl;
 | 
					  std::cout <<COL_YELLOW<< std::endl;
 | 
				
			||||||
  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
 | 
					  std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
 | 
				
			||||||
  std::cout << "Colours by Tadahito Boyle "<<std::endl;
 | 
					 | 
				
			||||||
  std::cout << std::endl;
 | 
					  std::cout << std::endl;
 | 
				
			||||||
  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
 | 
					  std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
 | 
				
			||||||
  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
 | 
					  std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
 | 
				
			||||||
@@ -274,6 +273,7 @@ void Grid_init(int *argc,char ***argv)
 | 
				
			|||||||
  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
 | 
					  std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the"<<std::endl;
 | 
				
			||||||
  std::cout << "GNU General Public License for more details."<<std::endl;
 | 
					  std::cout << "GNU General Public License for more details."<<std::endl;
 | 
				
			||||||
  std::cout << COL_BACKGROUND <<std::endl;
 | 
					  std::cout << COL_BACKGROUND <<std::endl;
 | 
				
			||||||
 | 
					  std::cout << std::endl;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -261,6 +261,9 @@
 | 
				
			|||||||
	 }
 | 
						 }
 | 
				
			||||||
       };
 | 
					       };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					       inline uint64_t Touch(int ent) {
 | 
				
			||||||
 | 
						 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
 | 
				
			||||||
 | 
					       }
 | 
				
			||||||
       inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
 | 
					       inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
 | 
				
			||||||
	 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 | 
						 _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
 | 
				
			||||||
	 local = _entries[ent]._is_local;
 | 
						 local = _entries[ent]._is_local;
 | 
				
			||||||
@@ -269,6 +272,11 @@
 | 
				
			|||||||
	 if (local) return base + _entries[ent]._byte_offset;
 | 
						 if (local) return base + _entries[ent]._byte_offset;
 | 
				
			||||||
	 else       return _entries[ent]._byte_offset;
 | 
						 else       return _entries[ent]._byte_offset;
 | 
				
			||||||
       }
 | 
					       }
 | 
				
			||||||
 | 
					       inline uint64_t GetPFInfo(int ent,uint64_t base) {
 | 
				
			||||||
 | 
						 int local = _entries[ent]._is_local;
 | 
				
			||||||
 | 
						 if (local) return base + _entries[ent]._byte_offset;
 | 
				
			||||||
 | 
						 else       return        _entries[ent]._byte_offset;
 | 
				
			||||||
 | 
					       }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
       // Comms buffers
 | 
					       // Comms buffers
 | 
				
			||||||
       std::vector<Vector<scalar_object> > u_simd_send_buf;
 | 
					       std::vector<Vector<scalar_object> > u_simd_send_buf;
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										0
									
								
								lib/algorithms/approx/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/algorithms/approx/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								lib/communicator/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/communicator/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								lib/qcd/action/fermion/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/qcd/action/fermion/.dirstamp
									
									
									
									
									
										Normal file
									
								
							@@ -63,7 +63,7 @@ namespace Grid {
 | 
				
			|||||||
	Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 | 
						Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
 | 
				
			||||||
	assert(zdata->n==this->Ls);
 | 
						assert(zdata->n==this->Ls);
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
 | 
						//	std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
 | 
				
			||||||
	// Call base setter
 | 
						// Call base setter
 | 
				
			||||||
	this->SetCoefficientsTanh(zdata,1.0,0.0);
 | 
						this->SetCoefficientsTanh(zdata,1.0,0.0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -53,6 +53,8 @@ namespace QCD {
 | 
				
			|||||||
	StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
 | 
						StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
 | 
				
			||||||
	StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
 | 
						StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
 | 
				
			||||||
	mass(_mass),
 | 
						mass(_mass),
 | 
				
			||||||
 | 
						Lebesgue(_grid),
 | 
				
			||||||
 | 
						LebesgueEvenOdd(_cbgrid),
 | 
				
			||||||
	Umu(&Fgrid),
 | 
						Umu(&Fgrid),
 | 
				
			||||||
	UmuEven(&Hgrid),
 | 
						UmuEven(&Hgrid),
 | 
				
			||||||
	UmuOdd (&Hgrid) 
 | 
						UmuOdd (&Hgrid) 
 | 
				
			||||||
@@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
    out.checkerboard = in.checkerboard;
 | 
					    out.checkerboard = in.checkerboard;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    DhopInternal(Stencil,Umu,in,out,dag);
 | 
					    DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  template<class Impl>
 | 
					  template<class Impl>
 | 
				
			||||||
@@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    assert(in.checkerboard==Even);
 | 
					    assert(in.checkerboard==Even);
 | 
				
			||||||
    out.checkerboard = Odd;
 | 
					    out.checkerboard = Odd;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    DhopInternal(StencilEven,UmuOdd,in,out,dag);
 | 
					    DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  template<class Impl>
 | 
					  template<class Impl>
 | 
				
			||||||
@@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    assert(in.checkerboard==Odd);
 | 
					    assert(in.checkerboard==Odd);
 | 
				
			||||||
    out.checkerboard = Even;
 | 
					    out.checkerboard = Even;
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    DhopInternal(StencilOdd,UmuEven,in,out,dag);
 | 
					    DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  template<class Impl>
 | 
					  template<class Impl>
 | 
				
			||||||
@@ -285,7 +287,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template<class Impl>
 | 
					  template<class Impl>
 | 
				
			||||||
  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
					  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
 | 
				
			||||||
					 const FermionField &in, FermionField &out,int dag) 
 | 
										 const FermionField &in, FermionField &out,int dag) 
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
				
			||||||
@@ -296,12 +298,12 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    if ( dag == DaggerYes ) {
 | 
					    if ( dag == DaggerYes ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
 | 
						Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    } else {
 | 
					    } else {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
	Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
 | 
						Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -111,7 +111,7 @@ namespace Grid {
 | 
				
			|||||||
			 const FermionField &B,
 | 
								 const FermionField &B,
 | 
				
			||||||
			 int dag);
 | 
								 int dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
					      void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
 | 
				
			||||||
			const FermionField &in, FermionField &out,int dag) ;
 | 
								const FermionField &in, FermionField &out,int dag) ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // Constructor
 | 
					      // Constructor
 | 
				
			||||||
@@ -147,6 +147,10 @@ namespace Grid {
 | 
				
			|||||||
      DoubledGaugeField UmuEven;
 | 
					      DoubledGaugeField UmuEven;
 | 
				
			||||||
      DoubledGaugeField UmuOdd;
 | 
					      DoubledGaugeField UmuOdd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      LebesgueOrder Lebesgue;
 | 
				
			||||||
 | 
					      LebesgueOrder LebesgueEvenOdd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 | 
					    typedef WilsonFermion<WilsonImplF> WilsonFermionF;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -321,14 +321,14 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					    for(int ss=0;ss<U._grid->oSites();ss++){
 | 
				
			||||||
	int sU=ss;
 | 
						int sU=ss;
 | 
				
			||||||
	int sF=LLs*sU;
 | 
						int sF=LLs*sU;
 | 
				
			||||||
	Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
 | 
						Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else {
 | 
					  } else {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
    for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					    for(int ss=0;ss<U._grid->oSites();ss++){
 | 
				
			||||||
      int sU=ss;
 | 
					      int sU=ss;
 | 
				
			||||||
      int sF=LLs*sU;
 | 
					      int sF=LLs*sU;
 | 
				
			||||||
      Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
 | 
					      Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -38,20 +38,23 @@ template<class Impl>
 | 
				
			|||||||
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 | 
					WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
											  int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
					#ifdef AVX512
 | 
				
			||||||
  if ( AsmOpt ) {
 | 
					  if ( AsmOpt ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);
 | 
					    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  } else {
 | 
					  } else {
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					  {  
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
    for(int site=0;site<Ns;site++) {
 | 
					    for(int site=0;site<Ns;site++) {
 | 
				
			||||||
      for(int s=0;s<Ls;s++) {
 | 
					      for(int s=0;s<Ls;s++) {
 | 
				
			||||||
	if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
 | 
						if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
 | 
				
			||||||
	else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
 | 
						else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
 | 
				
			||||||
	sF++;
 | 
						sF++;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      sU++;
 | 
					      sU++;
 | 
				
			||||||
@@ -61,17 +64,17 @@ void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
										   int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  // No asm implementation yet.
 | 
					  // No asm implementation yet.
 | 
				
			||||||
  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
					  //  if ( AsmOpt )     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
				
			||||||
  //  else
 | 
					  //  else
 | 
				
			||||||
  for(int site=0;site<Ns;site++) {
 | 
					  for(int site=0;site<Ns;site++) {
 | 
				
			||||||
    for(int s=0;s<Ls;s++) {
 | 
					    for(int s=0;s<Ls;s++) {
 | 
				
			||||||
      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
					      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
				
			||||||
      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
 | 
					      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
 | 
				
			||||||
      sF++;
 | 
					      sF++;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    sU++;
 | 
					    sU++;
 | 
				
			||||||
@@ -84,7 +87,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField
 | 
				
			|||||||
  ////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
					   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					   int sF,int sU,const FermionField &in, FermionField &out)
 | 
										   int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -262,7 +265,7 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaug
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  // Need controls to do interior, exterior, or both
 | 
					  // Need controls to do interior, exterior, or both
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						  int sF,int sU,const FermionField &in, FermionField &out)
 | 
											  int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -53,11 +53,11 @@ namespace Grid {
 | 
				
			|||||||
     
 | 
					     
 | 
				
			||||||
    public:
 | 
					    public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
								   int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
								      int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -67,24 +67,24 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    private:
 | 
					    private:
 | 
				
			||||||
     // Specialised variants
 | 
					     // Specialised variants
 | 
				
			||||||
     void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			   int sF,int sU, const FermionField &in, FermionField &out);
 | 
								   int sF,int sU, const FermionField &in, FermionField &out);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
     void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,const FermionField &in,FermionField &out);
 | 
								      int sF,int sU,const FermionField &in,FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
								      int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,const FermionField &in, FermionField &out);
 | 
								      int sF,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
     
 | 
					     
 | 
				
			||||||
     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					     void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
				 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
				 int sF,int sU,const FermionField &in, FermionField &out);
 | 
									 int sF,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
    public:
 | 
					    public:
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -39,9 +39,9 @@ namespace QCD {
 | 
				
			|||||||
  // Default to no assembler implementation
 | 
					  // Default to no assembler implementation
 | 
				
			||||||
  ///////////////////////////////////////////////////////////
 | 
					  ///////////////////////////////////////////////////////////
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
				
			||||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
										       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  assert(0);
 | 
					  assert(0);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -67,49 +67,54 @@ int setupSigns(void ){
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
static int signInit = setupSigns();
 | 
					static int signInit = setupSigns();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define label(A)  ilabel(A)
 | 
				
			||||||
 | 
					#define ilabel(A) ".globl\n"  #A ":\n" 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
 | 
					#define MAYBEPERM(A,perm) if (perm) { A ; }
 | 
				
			||||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 | 
					#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
 | 
				
			||||||
 | 
					#define FX(A) WILSONASM_ ##A
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
				
			||||||
						     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						     int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
											     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
				
			||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef VMOVIDUP
 | 
					#undef VMOVIDUP
 | 
				
			||||||
#undef VMOVRDUP
 | 
					#undef VMOVRDUP
 | 
				
			||||||
#undef MAYBEPERM
 | 
					#undef MAYBEPERM
 | 
				
			||||||
#undef MULT_2SPIN
 | 
					#undef MULT_2SPIN
 | 
				
			||||||
 | 
					#undef FX 
 | 
				
			||||||
 | 
					#define FX(A) DWFASM_ ## A
 | 
				
			||||||
#define MAYBEPERM(A,B) 
 | 
					#define MAYBEPERM(A,B) 
 | 
				
			||||||
#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 | 
					#define VMOVIDUP(A,B,C)                                  VBCASTIDUPf(A,B,C)
 | 
				
			||||||
#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 | 
					#define VMOVRDUP(A,B,C)                                  VBCASTRDUPf(A,B,C)
 | 
				
			||||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 | 
					#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
				
			||||||
								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								   int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
													   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
 | 
				
			||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
					#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							      int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
												      int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
					template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
												       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
					template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
												       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
					template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
												       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
												       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U, 
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, 
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
												       int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);		
 | 
				
			||||||
}}
 | 
					}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,35 +1,44 @@
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
  int locala,perma, ptypea;
 | 
					  int local,perm, ptype;
 | 
				
			||||||
  int localb,permb, ptypeb;
 | 
					  uint64_t base;
 | 
				
			||||||
  uint64_t basea, baseb;
 | 
					  uint64_t basep;
 | 
				
			||||||
  uint64_t basex;
 | 
					 | 
				
			||||||
  const uint64_t plocal =(uint64_t) & in._odata[0];
 | 
					  const uint64_t plocal =(uint64_t) & in._odata[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //  vComplexF isigns[2] = { signs[0], signs[1] };
 | 
					  //  vComplexF isigns[2] = { signs[0], signs[1] };
 | 
				
			||||||
  vComplexF *isigns = &signs[0];
 | 
					  vComplexF *isigns = &signs[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  MASK_REGS;
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					  int nmax=U._grid->oSites();
 | 
				
			||||||
  for(int site=0;site<Ns;site++) {
 | 
					  for(int site=0;site<Ns;site++) {
 | 
				
			||||||
 | 
					  int sU =lo.Reorder(ssU);
 | 
				
			||||||
 | 
					  int ssn=ssU+1; 
 | 
				
			||||||
 | 
					  if(ssn>=nmax) ssn=0;
 | 
				
			||||||
 | 
					  int sUn=lo.Reorder(ssn);
 | 
				
			||||||
  for(int s=0;s<Ls;s++) {
 | 
					  for(int s=0;s<Ls;s++) {
 | 
				
			||||||
 | 
					  ss =sU*Ls+s;
 | 
				
			||||||
 | 
					  ssn=sUn*Ls+s; 
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  // Xp
 | 
					  // Xp
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  int  ent=ss*8;// 2*Ndim
 | 
					  int  ent=ss*8;// 2*Ndim
 | 
				
			||||||
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
					  int nent=ssn*8;
 | 
				
			||||||
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
					 | 
				
			||||||
  basex = basea;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( locala ) {
 | 
					  PF_GAUGE(Xp); 
 | 
				
			||||||
 | 
					  base  = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH1_CHIMU(base);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  basep = st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
    LOAD64(%r10,isigns);
 | 
					    LOAD64(%r10,isigns);
 | 
				
			||||||
    XM_PROJMEM(basea);
 | 
					    XM_PROJMEM(base);
 | 
				
			||||||
    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
					    MAYBEPERM(PERMUTE_DIR3,perm);
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    LOAD_CHI(basea);
 | 
					    LOAD_CHI(base);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  base = st.GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(base);
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    MULT_2SPIN_DIR_PFXP(Xp,baseb);
 | 
					    MULT_2SPIN_DIR_PFXP(Xp,basep);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  LOAD64(%r10,isigns);
 | 
					  LOAD64(%r10,isigns);
 | 
				
			||||||
  XM_RECON;
 | 
					  XM_RECON;
 | 
				
			||||||
@@ -37,16 +46,18 @@
 | 
				
			|||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  // Yp
 | 
					  // Yp
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
					  basep = st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
  if ( localb ) {
 | 
					  if ( local ) {
 | 
				
			||||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
    YM_PROJMEM(baseb);
 | 
					    YM_PROJMEM(base);
 | 
				
			||||||
    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
					    MAYBEPERM(PERMUTE_DIR2,perm);
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    LOAD_CHI(baseb);
 | 
					    LOAD_CHI(base);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  base  = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(base);
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    MULT_2SPIN_DIR_PFYP(Yp,basea);
 | 
					    MULT_2SPIN_DIR_PFYP(Yp,basep);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
  YM_RECON_ACCUM;
 | 
					  YM_RECON_ACCUM;
 | 
				
			||||||
@@ -54,16 +65,18 @@
 | 
				
			|||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  // Zp
 | 
					  // Zp
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
					  basep = st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
  if ( locala ) {
 | 
					  if ( local ) {
 | 
				
			||||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
    ZM_PROJMEM(basea);
 | 
					    ZM_PROJMEM(base);
 | 
				
			||||||
    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
					    MAYBEPERM(PERMUTE_DIR1,perm);
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    LOAD_CHI(basea);
 | 
					    LOAD_CHI(base);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  base  = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(base);
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    MULT_2SPIN_DIR_PFZP(Zp,baseb);
 | 
					    MULT_2SPIN_DIR_PFZP(Zp,basep);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
  ZM_RECON_ACCUM;
 | 
					  ZM_RECON_ACCUM;
 | 
				
			||||||
@@ -71,16 +84,18 @@
 | 
				
			|||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  // Tp
 | 
					  // Tp
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
					  basep = st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
  if ( localb ) {
 | 
					  if ( local ) {
 | 
				
			||||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
    TM_PROJMEM(baseb);
 | 
					    TM_PROJMEM(base);
 | 
				
			||||||
    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
					    MAYBEPERM(PERMUTE_DIR0,perm);
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    LOAD_CHI(baseb);
 | 
					    LOAD_CHI(base);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(base);
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    MULT_2SPIN_DIR_PFTP(Tp,basea);
 | 
					    MULT_2SPIN_DIR_PFTP(Tp,basep);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
  TM_RECON_ACCUM;
 | 
					  TM_RECON_ACCUM;
 | 
				
			||||||
@@ -88,16 +103,19 @@
 | 
				
			|||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  // Xm
 | 
					  // Xm
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
					  basep= (uint64_t) &out._odata[ss];
 | 
				
			||||||
  if ( locala ) {
 | 
					  //  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
    XP_PROJMEM(basea);
 | 
					    XP_PROJMEM(base);
 | 
				
			||||||
    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
					    MAYBEPERM(PERMUTE_DIR3,perm);
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    LOAD_CHI(basea);
 | 
					    LOAD_CHI(base);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(base);
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    MULT_2SPIN_DIR_PFXM(Xm,baseb);
 | 
					    MULT_2SPIN_DIR_PFXM(Xm,basep);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
  XP_RECON_ACCUM;
 | 
					  XP_RECON_ACCUM;
 | 
				
			||||||
@@ -105,16 +123,18 @@
 | 
				
			|||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  // Ym
 | 
					  // Ym
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
					  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
  if ( localb ) {
 | 
					  if ( local ) {
 | 
				
			||||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
    YP_PROJMEM(baseb);
 | 
					    YP_PROJMEM(base);
 | 
				
			||||||
    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
					    MAYBEPERM(PERMUTE_DIR2,perm);
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    LOAD_CHI(baseb);
 | 
					    LOAD_CHI(base);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(base);
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    MULT_2SPIN_DIR_PFYM(Ym,basea);
 | 
					    MULT_2SPIN_DIR_PFYM(Ym,basep);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
  YP_RECON_ACCUM;
 | 
					  YP_RECON_ACCUM;
 | 
				
			||||||
@@ -122,16 +142,18 @@
 | 
				
			|||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  // Zm
 | 
					  // Zm
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
					  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
  if ( locala ) {
 | 
					  if ( local ) {
 | 
				
			||||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
    ZP_PROJMEM(basea);
 | 
					    ZP_PROJMEM(base);
 | 
				
			||||||
    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
					    MAYBEPERM(PERMUTE_DIR1,perm);
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    LOAD_CHI(basea);
 | 
					    LOAD_CHI(base);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(base);
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    MULT_2SPIN_DIR_PFZM(Zm,baseb);
 | 
					    MULT_2SPIN_DIR_PFZM(Zm,basep);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
  ZP_RECON_ACCUM;
 | 
					  ZP_RECON_ACCUM;
 | 
				
			||||||
@@ -139,26 +161,26 @@
 | 
				
			|||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  // Tm
 | 
					  // Tm
 | 
				
			||||||
  ////////////////////////////////
 | 
					  ////////////////////////////////
 | 
				
			||||||
  basea = (uint64_t)&out._odata[ss];
 | 
					  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
  if ( localb ) {
 | 
					  if ( local ) {
 | 
				
			||||||
    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
    TP_PROJMEM(baseb);
 | 
					    TP_PROJMEM(base);
 | 
				
			||||||
    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
					    MAYBEPERM(PERMUTE_DIR0,perm);
 | 
				
			||||||
  } else { 
 | 
					  } else { 
 | 
				
			||||||
    LOAD_CHI(baseb);
 | 
					    LOAD_CHI(base);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  base= (uint64_t) &out._odata[ss];
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(base);
 | 
				
			||||||
  {
 | 
					  {
 | 
				
			||||||
    MULT_2SPIN_DIR_PFTM(Tm,basea);
 | 
					    MULT_2SPIN_DIR_PFTM(Tm,basep);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
  TP_RECON_ACCUM;
 | 
					  TP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  PREFETCH_CHIMU(basex);
 | 
					  basep= st.GetPFInfo(nent,plocal); nent++;
 | 
				
			||||||
  SAVE_RESULT(&out._odata[ss]);
 | 
					  SAVE_RESULT(base,basep);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  
 | 
					 | 
				
			||||||
  ss++;
 | 
					 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  sU++;
 | 
					  ssU++;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										161
									
								
								lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										161
									
								
								lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,161 @@
 | 
				
			|||||||
 | 
					{
 | 
				
			||||||
 | 
					  int locala,perma, ptypea;
 | 
				
			||||||
 | 
					  int localb,permb, ptypeb;
 | 
				
			||||||
 | 
					  uint64_t basea, baseb;
 | 
				
			||||||
 | 
					  const uint64_t plocal =(uint64_t) & in._odata[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  vComplexF isigns[2] = { signs[0], signs[1] };
 | 
				
			||||||
 | 
					  vComplexF *isigns = &signs[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int site=0;site<Ns;site++) {
 | 
				
			||||||
 | 
					  int sU=lo.Reorder(ssU);  
 | 
				
			||||||
 | 
					  for(int s=0;s<Ls;s++) {
 | 
				
			||||||
 | 
					  ss=sU*Ls+s;
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Xp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  int ent=ss*8;// 2*Ndim
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);
 | 
				
			||||||
 | 
					    XM_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFXP(Xp,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);
 | 
				
			||||||
 | 
					  XM_RECON;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Yp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    YM_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFYP(Yp,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  YM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Zp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    ZM_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFZP(Zp,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  ZM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Tp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    TM_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFTP(Tp,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  TM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Xm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    XP_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFXM(Xm,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  XP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Ym
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    YP_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFYM(Ym,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  YP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Zm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    ZP_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFZM(Zm,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  ZP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Tm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = (uint64_t)&out._odata[ss];
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    TP_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFTM(Tm,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  TP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  SAVE_RESULT(&out._odata[ss],baseb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  } 
 | 
				
			||||||
 | 
					  ssU++;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										187
									
								
								lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										187
									
								
								lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,187 @@
 | 
				
			|||||||
 | 
					{
 | 
				
			||||||
 | 
					  int locala,perma, ptypea;
 | 
				
			||||||
 | 
					  int localb,permb, ptypeb;
 | 
				
			||||||
 | 
					  int localc,permc, ptypec;
 | 
				
			||||||
 | 
					  uint64_t basea, baseb, basec;
 | 
				
			||||||
 | 
					  uint64_t basex;
 | 
				
			||||||
 | 
					  const uint64_t plocal =(uint64_t) & in._odata[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  vComplexF isigns[2] = { signs[0], signs[1] };
 | 
				
			||||||
 | 
					  vComplexF *isigns = &signs[0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int site=0;site<Ns;site++) {
 | 
				
			||||||
 | 
					  int sU=lo.Reorder(ssU);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int s=0;s<Ls;s++) {
 | 
				
			||||||
 | 
					  ss     =sU*Ls+s;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Xp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  int ent=ss*8;// 2*Ndim
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(basea);
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(baseb);
 | 
				
			||||||
 | 
					  basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(basec);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  basex = basea;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  label(FX(XP) );
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);
 | 
				
			||||||
 | 
					    XM_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR3,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFXP(Xp,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);
 | 
				
			||||||
 | 
					  XM_RECON;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Yp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(basea);
 | 
				
			||||||
 | 
					  label(FX(YP) );
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    YM_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR2,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFYP(Yp,basec);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  YM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Zp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(baseb);
 | 
				
			||||||
 | 
					  label(FX(ZP) );
 | 
				
			||||||
 | 
					  if ( localc ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    ZM_PROJMEM(basec);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR1,permc);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basec);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFZP(Zp,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  ZM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Tp
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(basec);
 | 
				
			||||||
 | 
					  label(FX(TP) );
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    TM_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR0,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFTP(Tp,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  TM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Xm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(basea);
 | 
				
			||||||
 | 
					  label(FX(XM) );
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    XP_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR3,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFXM(Xm,basec);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  XP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Ym
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(baseb);
 | 
				
			||||||
 | 
					  label(FX(YM) );
 | 
				
			||||||
 | 
					  if ( localc ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    YP_PROJMEM(basec);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR2,permc);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basec);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFYM(Ym,basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  YP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Zm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(basec);
 | 
				
			||||||
 | 
					  label(FX(ZM) );
 | 
				
			||||||
 | 
					  if ( locala ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    ZP_PROJMEM(basea);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR1,perma);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(basea);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFZM(Zm,baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  ZP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  // Tm
 | 
				
			||||||
 | 
					  ////////////////////////////////
 | 
				
			||||||
 | 
					  basea = (uint64_t)&out._odata[ss];
 | 
				
			||||||
 | 
					  PREFETCH_CHIMU(basea);
 | 
				
			||||||
 | 
					  label(FX(TM) );
 | 
				
			||||||
 | 
					  if ( localb ) {
 | 
				
			||||||
 | 
					    LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					    TP_PROJMEM(baseb);
 | 
				
			||||||
 | 
					    MAYBEPERM(PERMUTE_DIR0,permb);
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(baseb);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFTM(Tm,basec);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  LOAD64(%r10,isigns);  // times i => shuffle and xor the real part sign bit
 | 
				
			||||||
 | 
					  TP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  PREFETCH_CHIMU(basex);
 | 
				
			||||||
 | 
					  label(FX(SAV) );
 | 
				
			||||||
 | 
					  SAVE_RESULT(&out._odata[ss]);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  ssU++;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
@@ -312,7 +312,7 @@ namespace QCD {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					       int ss,int sU,const FermionField &in, FermionField &out)
 | 
										       int ss,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -555,7 +555,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					       int ss,int sU,const FermionField &in, FermionField &out)
 | 
										       int ss,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -803,7 +803,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
 | 
				
			|||||||
  // Specialise Gparity to simple implementation
 | 
					  // Specialise Gparity to simple implementation
 | 
				
			||||||
  ////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
												     int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -811,7 +811,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,Dou
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								int sF,int sU,const FermionField &in, FermionField &out)
 | 
													int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -819,7 +819,7 @@ void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							     int sF,int sU,const FermionField &in, FermionField &out)
 | 
												     int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -827,7 +827,7 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,Dou
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<>
 | 
					template<>
 | 
				
			||||||
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								int sF,int sU,const FermionField &in, FermionField &out)
 | 
													int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -839,44 +839,44 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,
 | 
				
			|||||||
////////////// Wilson ; uses this implementation /////////////////////
 | 
					////////////// Wilson ; uses this implementation /////////////////////
 | 
				
			||||||
// Need Nc=3 though //
 | 
					// Need Nc=3 though //
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,const FermionField &in, FermionField &out);
 | 
												       int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
												       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
							       int ss,int sU,const FermionField &in, FermionField &out);
 | 
												       int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								  int ss,int sU,const FermionField &in, FermionField &out);
 | 
													  int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								  int ss,int sU,const FermionField &in, FermionField &out);
 | 
													  int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
													      int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
													      int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
														 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
														 int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
														 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
														 int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
													      int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
													      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
								      int ss,int sU,const FermionField &in, FermionField &out);
 | 
													      int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
														 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
														 int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
					template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 | 
				
			||||||
									 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
														 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
									 int ss,int sU,const FermionField &in, FermionField &out);
 | 
														 int ss,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										0
									
								
								lib/qcd/hmc/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/qcd/hmc/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								lib/qcd/spin/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/qcd/spin/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										0
									
								
								lib/qcd/utils/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/qcd/utils/.dirstamp
									
									
									
									
									
										Normal file
									
								
							@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#ifndef GRID_ASM_INTEL_COMMON_512_H
 | 
					#ifndef GRID_ASM_INTEL_COMMON_512_H
 | 
				
			||||||
#define GRID_ASM_INTEL_COMMON_512_H
 | 
					#define GRID_ASM_INTEL_COMMON_512_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					// Peformance options
 | 
				
			||||||
 | 
					////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					#undef  AVX512_PF_L2_WRITE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
// Opcodes common 
 | 
					// Opcodes common 
 | 
				
			||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
					////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -37,6 +42,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
           "mov     $0x5555, %%eax \n"\
 | 
					           "mov     $0x5555, %%eax \n"\
 | 
				
			||||||
           "kmovw    %%eax, %%k7 \n" : : : "%eax");
 | 
					           "kmovw    %%eax, %%k7 \n" : : : "%eax");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
					#define VZEROf(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
				
			||||||
#define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
					#define VZEROd(A)       "vpxorq " #A ","  #A "," #A ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -86,9 +93,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 | 
					#define VMOVf(A,DEST)   "vmovaps  " #A ", " #DEST  ";\n"
 | 
				
			||||||
#define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 | 
					#define VMOVd(A,DEST)   "vmovapd  " #A ", " #DEST  ";\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n" 
 | 
					#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n" 
 | 
				
			||||||
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
 | 
					#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n" 
 | 
				
			||||||
 | 
					#ifdef AVX512_PF_L2_WRITE
 | 
				
			||||||
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
 | 
					#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n" 
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					#define VPREFETCHW(O,A) 
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#define VPREFETCHNTA(O,A) 
 | 
				
			||||||
 | 
					#define VPREFETCH(O,A)    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define VEVICT(O,A)   
 | 
					#define VEVICT(O,A)   
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
 | 
					//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
 | 
				
			||||||
@@ -124,8 +138,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 | 
					#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
 | 
				
			||||||
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 | 
					#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define VPREFETCHNTA(O,A) 
 | 
					 | 
				
			||||||
#define VPREFETCH(O,A)    
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
					#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
				
			||||||
#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
					#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")"  ";\n"
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -104,7 +104,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#define LOAD_CHI(PTR)	 LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 | 
					#define LOAD_CHI(PTR)	 LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 | 
				
			||||||
#define SAVE_UCHI(PTR)	 SAVE_UCHIi(PTR)
 | 
					#define SAVE_UCHI(PTR)	 SAVE_UCHIi(PTR)
 | 
				
			||||||
#define SAVE_CHI(PTR)	 SAVE_CHIi(PTR)
 | 
					#define SAVE_CHI(PTR)	 SAVE_CHIi(PTR)
 | 
				
			||||||
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)
 | 
					#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define LOAD_CHIMUi \
 | 
					#define LOAD_CHIMUi \
 | 
				
			||||||
	   LOAD_CHIMU01i	\
 | 
						   LOAD_CHIMU01i	\
 | 
				
			||||||
@@ -169,22 +169,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
  VSTORE(5,%r8,Chi_12)				\
 | 
					  VSTORE(5,%r8,Chi_12)				\
 | 
				
			||||||
						);
 | 
											);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define SAVE_RESULTi(PTR)\
 | 
					 | 
				
			||||||
	   LOAD64(%r8,PTR)			\
 | 
					 | 
				
			||||||
  __asm__ (					\
 | 
					 | 
				
			||||||
	   VSTORE(0,%r8,result_00)		\
 | 
					 | 
				
			||||||
	   VSTORE(1,%r8,result_01)		\
 | 
					 | 
				
			||||||
	   VSTORE(2,%r8,result_02)		\
 | 
					 | 
				
			||||||
	   VSTORE(3,%r8,result_10)		\
 | 
					 | 
				
			||||||
	   VSTORE(4,%r8,result_11)		\
 | 
					 | 
				
			||||||
	   VSTORE(5,%r8,result_12)		\
 | 
					 | 
				
			||||||
	   VSTORE(6,%r8,result_20)		\
 | 
					 | 
				
			||||||
	   VSTORE(7,%r8,result_21)		\
 | 
					 | 
				
			||||||
	   VSTORE(8,%r8,result_22)		\
 | 
					 | 
				
			||||||
	   VSTORE(9,%r8,result_30)		\
 | 
					 | 
				
			||||||
	   VSTORE(10,%r8,result_31)		\
 | 
					 | 
				
			||||||
	   VSTORE(11,%r8,result_32) 		\
 | 
					 | 
				
			||||||
						);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
 | 
					#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
 | 
				
			||||||
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
 | 
					#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
 | 
				
			||||||
@@ -277,8 +261,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#define XM_PROJMEM(PTR) \
 | 
					#define XM_PROJMEM(PTR) \
 | 
				
			||||||
  LOAD64(%r8,PTR)\
 | 
					  LOAD64(%r8,PTR)\
 | 
				
			||||||
  __asm__ (								\
 | 
					  __asm__ (								\
 | 
				
			||||||
	   SHUF_CHIMU23i						\
 | 
					 | 
				
			||||||
	   LOAD_CHIi \
 | 
						   LOAD_CHIi \
 | 
				
			||||||
 | 
						   SHUF_CHIMU23i						\
 | 
				
			||||||
	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
 | 
						   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
 | 
				
			||||||
	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
 | 
						   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
 | 
				
			||||||
	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
 | 
						   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
 | 
				
			||||||
@@ -306,8 +290,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
#define ZM_PROJMEM(PTR) \
 | 
					#define ZM_PROJMEM(PTR) \
 | 
				
			||||||
  LOAD64(%r8,PTR)							\
 | 
					  LOAD64(%r8,PTR)							\
 | 
				
			||||||
  __asm__ (								\
 | 
					  __asm__ (								\
 | 
				
			||||||
	   SHUF_CHIMU23i						\
 | 
					 | 
				
			||||||
           LOAD_CHIi \
 | 
					           LOAD_CHIi \
 | 
				
			||||||
 | 
						   SHUF_CHIMU23i						\
 | 
				
			||||||
	   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
 | 
						   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
 | 
				
			||||||
	   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
 | 
						   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
 | 
				
			||||||
	   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
 | 
						   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
 | 
				
			||||||
@@ -559,21 +543,93 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
  VSUB(UChi_02,result_22,result_22)\
 | 
					  VSUB(UChi_02,result_22,result_22)\
 | 
				
			||||||
  VSUB(UChi_12,result_32,result_32) );
 | 
					  VSUB(UChi_12,result_32,result_32) );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define AVX512_PF_L1
 | 
				
			||||||
 | 
					#define AVX512_PF_L2_GAUGE
 | 
				
			||||||
 | 
					#define AVX512_PF_L2_TABLE
 | 
				
			||||||
 | 
					#undef  AVX512_PF_L2_LINEAR
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef AVX512_PF_L2_TABLE  
 | 
				
			||||||
 | 
					// P1 Fetches the base pointer for next link into L1 with P1
 | 
				
			||||||
 | 
					// M1 Fetches the next site pointer into L2
 | 
				
			||||||
 | 
					#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
 | 
				
			||||||
 | 
					#define VPREFETCH_P2(A,B) 
 | 
				
			||||||
 | 
					#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
 | 
				
			||||||
 | 
					#define VPREFETCH_M2(A,B) 
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef AVX512_PF_L2_LINEAR
 | 
				
			||||||
 | 
					#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
 | 
				
			||||||
 | 
					#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
 | 
				
			||||||
 | 
					#define VPREFETCH_P1(A,B) 
 | 
				
			||||||
 | 
					#define VPREFETCH_P2(A,B)
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef AVX512_PF_L2_GAUGE
 | 
				
			||||||
 | 
					#define VPREFETCH_G1(A,B)  VPREFETCH1(A,B)
 | 
				
			||||||
 | 
					#define VPREFETCH_G2(A,B)  VPREFETCH2(A,B)
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define PF_GAUGE(A) \
 | 
				
			||||||
 | 
					  LOAD64(%r8,&U._odata[sU](A))						\
 | 
				
			||||||
 | 
					  __asm__ (								\
 | 
				
			||||||
 | 
						   VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8)			\
 | 
				
			||||||
 | 
						   VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8)			\
 | 
				
			||||||
 | 
														);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define SAVE_RESULTi(PTR,pf)			\
 | 
				
			||||||
 | 
						   LOAD64(%r8,PTR)			\
 | 
				
			||||||
 | 
						   LOAD64(%r9,pf)			\
 | 
				
			||||||
 | 
					  __asm__ (					\
 | 
				
			||||||
 | 
						   VSTORE(0,%r8,result_00)	VPREFETCH_M1(0,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(1,%r8,result_01)	VPREFETCH_M1(1,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(2,%r8,result_02)	VPREFETCH_M1(2,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(3,%r8,result_10)	VPREFETCH_M1(3,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(4,%r8,result_11)	VPREFETCH_M1(4,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(5,%r8,result_12)	VPREFETCH_M1(5,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(6,%r8,result_20)	VPREFETCH_M1(6,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(7,%r8,result_21)	VPREFETCH_M1(7,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(8,%r8,result_22)	VPREFETCH_M1(8,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(9,%r8,result_30)	VPREFETCH_M1(9,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(10,%r8,result_31)	VPREFETCH_M1(10,%r9)	\
 | 
				
			||||||
 | 
						   VSTORE(11,%r8,result_32) 	VPREFETCH_M1(11,%r9)	\
 | 
				
			||||||
 | 
											);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef AVX512_PF_L2_TABLE
 | 
				
			||||||
#define PREFETCH_CHIMU(A) \
 | 
					#define PREFETCH_CHIMU(A) \
 | 
				
			||||||
  LOAD64(%r9,A)							\
 | 
					  LOAD64(%r9,A)							\
 | 
				
			||||||
  __asm__ (							\
 | 
					  __asm__ (							\
 | 
				
			||||||
  VPREFETCHG(12,%r9)\
 | 
						   VPREFETCH_P1(0,%r9)					\
 | 
				
			||||||
  VPREFETCHG(13,%r9)\
 | 
						   VPREFETCH_P1(1,%r9)					\
 | 
				
			||||||
  VPREFETCHG(14,%r9)\
 | 
						   VPREFETCH_P1(2,%r9)					\
 | 
				
			||||||
  VPREFETCHG(15,%r9)\
 | 
						   VPREFETCH_P1(3,%r9)					\
 | 
				
			||||||
  VPREFETCHG(16,%r9)\
 | 
						   VPREFETCH_P1(4,%r9)					\
 | 
				
			||||||
  VPREFETCHG(17,%r9)\
 | 
						   VPREFETCH_P1(5,%r9)					\
 | 
				
			||||||
  VPREFETCHG(18,%r9)\
 | 
						   VPREFETCH_P1(6,%r9)					\
 | 
				
			||||||
  VPREFETCHG(19,%r9)\
 | 
						   VPREFETCH_P1(7,%r9)					\
 | 
				
			||||||
  VPREFETCHG(20,%r9)\
 | 
						   VPREFETCH_P1(8,%r9)					\
 | 
				
			||||||
  VPREFETCHG(21,%r9)\
 | 
						   VPREFETCH_P1(9,%r9)					\
 | 
				
			||||||
  VPREFETCHG(22,%r9)\
 | 
						   VPREFETCH_P1(10,%r9)					\
 | 
				
			||||||
  VPREFETCHG(23,%r9));
 | 
						   VPREFETCH_P1(11,%r9));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					#define PREFETCH_CHIMU(A)
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define PREFETCH1_CHIMU(A) \
 | 
				
			||||||
 | 
					  LOAD64(%r9,A)							\
 | 
				
			||||||
 | 
					  __asm__ (							\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(0,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(1,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(2,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(3,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(4,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(5,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(6,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(7,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(8,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(9,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(10,%r9)					\
 | 
				
			||||||
 | 
						   VPREFETCH_P1(11,%r9));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define PERMUTE_DIR0 __asm__ (			\
 | 
					#define PERMUTE_DIR0 __asm__ (			\
 | 
				
			||||||
  VPERM0(Chi_00,Chi_00)	\
 | 
					  VPERM0(Chi_00,Chi_00)	\
 | 
				
			||||||
@@ -612,15 +668,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
  LOAD64(%r8,ptr)						\
 | 
					  LOAD64(%r8,ptr)						\
 | 
				
			||||||
  LOAD64(%r9,pf)						\
 | 
					  LOAD64(%r9,pf)						\
 | 
				
			||||||
	   __asm__ (						\
 | 
						   __asm__ (						\
 | 
				
			||||||
	   VPREFETCH2(9,%r8)				   \
 | 
						   VPREFETCH_G2(9,%r8)				   \
 | 
				
			||||||
	   VPREFETCH2(10,%r8)					   \
 | 
						   VPREFETCH_G2(10,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(11,%r8)					   \
 | 
						   VPREFETCH_G2(11,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(12,%r8)					   \
 | 
						   VPREFETCH_G2(12,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(13,%r8)					   \
 | 
						   VPREFETCH_G2(13,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(14,%r8)					   \
 | 
						   VPREFETCH_G2(14,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(15,%r8)					   \
 | 
						   VPREFETCH_G2(15,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(16,%r8)					   \
 | 
						   VPREFETCH_G2(16,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(17,%r8)					   \
 | 
						   VPREFETCH_G2(17,%r8)					   \
 | 
				
			||||||
	   VSHUF(Chi_00,T1)				\
 | 
						   VSHUF(Chi_00,T1)				\
 | 
				
			||||||
	   VMOVIDUP(0,%r8,Z0 )					\
 | 
						   VMOVIDUP(0,%r8,Z0 )					\
 | 
				
			||||||
           VMOVIDUP(3,%r8,Z1 )					\
 | 
					           VMOVIDUP(3,%r8,Z1 )					\
 | 
				
			||||||
@@ -632,10 +688,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
           VMUL(Z1,T2,UChi_11)            VMOVIDUP(1,%r8,Z0 )	\
 | 
					           VMUL(Z1,T2,UChi_11)            VMOVIDUP(1,%r8,Z0 )	\
 | 
				
			||||||
           VMUL(Z2,T1,UChi_02)            VMOVIDUP(4,%r8,Z1 )	\
 | 
					           VMUL(Z2,T1,UChi_02)            VMOVIDUP(4,%r8,Z1 )	\
 | 
				
			||||||
           VMUL(Z2,T2,UChi_12)            VMOVIDUP(7,%r8,Z2 )	\
 | 
					           VMUL(Z2,T2,UChi_12)            VMOVIDUP(7,%r8,Z2 )	\
 | 
				
			||||||
	   VPREFETCHG(0,%r9)					   \
 | 
						   VPREFETCH_M1(0,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(1,%r9)					   \
 | 
						   VPREFETCH_M1(1,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(2,%r9)					   \
 | 
						   VPREFETCH_M1(2,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(3,%r9)					   \
 | 
						   VPREFETCH_M1(3,%r9)					   \
 | 
				
			||||||
	   /*18*/						\
 | 
						   /*18*/						\
 | 
				
			||||||
           VMADDSUB(Z3,Chi_00,UChi_00)    VSHUF(Chi_01,T1)	\
 | 
					           VMADDSUB(Z3,Chi_00,UChi_00)    VSHUF(Chi_01,T1)	\
 | 
				
			||||||
           VMADDSUB(Z3,Chi_10,UChi_10)				\
 | 
					           VMADDSUB(Z3,Chi_10,UChi_10)				\
 | 
				
			||||||
@@ -643,10 +699,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
           VMADDSUB(Z4,Chi_10,UChi_11)    VSHUF(Chi_11,T2)	\
 | 
					           VMADDSUB(Z4,Chi_10,UChi_11)    VSHUF(Chi_11,T2)	\
 | 
				
			||||||
           VMADDSUB(Z5,Chi_00,UChi_02)    VMOVRDUP(4,%r8,Z4 )	\
 | 
					           VMADDSUB(Z5,Chi_00,UChi_02)    VMOVRDUP(4,%r8,Z4 )	\
 | 
				
			||||||
           VMADDSUB(Z5,Chi_10,UChi_12)				\
 | 
					           VMADDSUB(Z5,Chi_10,UChi_12)				\
 | 
				
			||||||
	   VPREFETCHG(4,%r9)					   \
 | 
						   VPREFETCH_M1(4,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(5,%r9)					   \
 | 
						   VPREFETCH_M1(5,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(6,%r9)					   \
 | 
						   VPREFETCH_M1(6,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(7,%r9)					   \
 | 
						   VPREFETCH_M1(7,%r9)					   \
 | 
				
			||||||
	   /*28*/						\
 | 
						   /*28*/						\
 | 
				
			||||||
           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(7,%r8,Z5 )	\
 | 
					           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(7,%r8,Z5 )	\
 | 
				
			||||||
           VMADDSUB(Z0,T2,UChi_10)				\
 | 
					           VMADDSUB(Z0,T2,UChi_10)				\
 | 
				
			||||||
@@ -673,15 +729,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
           VMADDSUB(Z4,Chi_11,UChi_11)    VSHUF(Chi_12,T2)	\
 | 
					           VMADDSUB(Z4,Chi_11,UChi_11)    VSHUF(Chi_12,T2)	\
 | 
				
			||||||
           VMADDSUB(Z5,Chi_01,UChi_02)    VMOVRDUP(5,%r8,Z4 )	\
 | 
					           VMADDSUB(Z5,Chi_01,UChi_02)    VMOVRDUP(5,%r8,Z4 )	\
 | 
				
			||||||
           VMADDSUB(Z5,Chi_11,UChi_12)				\
 | 
					           VMADDSUB(Z5,Chi_11,UChi_12)				\
 | 
				
			||||||
	   VPREFETCHG(9,%r8)				   \
 | 
						   VPREFETCH_M1(9,%r8)				   \
 | 
				
			||||||
	   VPREFETCHG(10,%r8)					   \
 | 
						   VPREFETCH_M1(10,%r8)					   \
 | 
				
			||||||
	   VPREFETCHG(11,%r8)					   \
 | 
						   VPREFETCH_M1(11,%r8)					   \
 | 
				
			||||||
	   VPREFETCHG(12,%r8)					   \
 | 
						   VPREFETCH_M1(12,%r8)					   \
 | 
				
			||||||
	   VPREFETCHG(13,%r8)					   \
 | 
						   VPREFETCH_M1(13,%r8)					   \
 | 
				
			||||||
	   VPREFETCHG(14,%r8)					   \
 | 
						   VPREFETCH_M1(14,%r8)					   \
 | 
				
			||||||
	   VPREFETCHG(15,%r8)					   \
 | 
						   VPREFETCH_M1(15,%r8)					   \
 | 
				
			||||||
	   VPREFETCHG(16,%r8)					   \
 | 
						   VPREFETCH_M1(16,%r8)					   \
 | 
				
			||||||
	   VPREFETCHG(17,%r8)					   \
 | 
						   VPREFETCH_M1(17,%r8)					   \
 | 
				
			||||||
	   /*48*/						\
 | 
						   /*48*/						\
 | 
				
			||||||
           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(8,%r8,Z5 ) \
 | 
					           VMADDSUB(Z0,T1,UChi_00)        VMOVRDUP(8,%r8,Z5 ) \
 | 
				
			||||||
           VMADDSUB(Z0,T2,UChi_10)			      \
 | 
					           VMADDSUB(Z0,T2,UChi_10)			      \
 | 
				
			||||||
@@ -689,10 +745,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
           VMADDSUB(Z1,T2,UChi_11)			      \
 | 
					           VMADDSUB(Z1,T2,UChi_11)			      \
 | 
				
			||||||
           VMADDSUB(Z2,T1,UChi_02)			      \
 | 
					           VMADDSUB(Z2,T1,UChi_02)			      \
 | 
				
			||||||
           VMADDSUB(Z2,T2,UChi_12)			      \
 | 
					           VMADDSUB(Z2,T2,UChi_12)			      \
 | 
				
			||||||
	   VPREFETCHG(8,%r9)					   \
 | 
						   VPREFETCH_M1(8,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(9,%r9)					   \
 | 
						   VPREFETCH_M1(9,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(10,%r9)					   \
 | 
						   VPREFETCH_M1(10,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(11,%r9)					   \
 | 
						   VPREFETCH_M1(11,%r9)					   \
 | 
				
			||||||
	   /*55*/					      \
 | 
						   /*55*/					      \
 | 
				
			||||||
           VMADDSUB(Z3,Chi_02,UChi_00)			      \
 | 
					           VMADDSUB(Z3,Chi_02,UChi_00)			      \
 | 
				
			||||||
           VMADDSUB(Z3,Chi_12,UChi_10)			      \
 | 
					           VMADDSUB(Z3,Chi_12,UChi_10)			      \
 | 
				
			||||||
@@ -711,56 +767,58 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
           VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10)   \
 | 
					           VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10)   \
 | 
				
			||||||
           VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11)   \
 | 
					           VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11)   \
 | 
				
			||||||
           VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12)   \
 | 
					           VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12)   \
 | 
				
			||||||
	   VPREFETCHG(0,%r9)					   \
 | 
						   VPREFETCH_M1(0,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(1,%r9)					   \
 | 
						   VPREFETCH_M1(1,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(2,%r9)					   \
 | 
						   VPREFETCH_M1(2,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(3,%r9)					   \
 | 
						   VPREFETCH_M1(3,%r9)					   \
 | 
				
			||||||
	   /*8*/						   \
 | 
						   /*8*/						   \
 | 
				
			||||||
           VSHUF(Chi_01,T1)	  VSHUF(Chi_11,T2)	       	   \
 | 
					           VSHUF(Chi_01,T1)	  VSHUF(Chi_11,T2)	       	   \
 | 
				
			||||||
           VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
 | 
					           VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
 | 
				
			||||||
           VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
 | 
					           VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
 | 
				
			||||||
           VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
 | 
					           VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
 | 
				
			||||||
	   VPREFETCHG(4,%r9)					   \
 | 
						   VPREFETCH_M1(4,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(5,%r9)					   \
 | 
						   VPREFETCH_M1(5,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(6,%r9)					   \
 | 
						   VPREFETCH_M1(6,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(7,%r9)					   \
 | 
						   VPREFETCH_M1(7,%r9)					   \
 | 
				
			||||||
	   /*16*/					  	   \
 | 
						   /*16*/					  	   \
 | 
				
			||||||
           VMADDSUBIDUP(1,%r8,T1,UChi_00)     VMADDSUBIDUP(1,%r8,T2,UChi_10)	   \
 | 
					           VMADDSUBIDUP(1,%r8,T1,UChi_00)     VMADDSUBIDUP(1,%r8,T2,UChi_10)	   \
 | 
				
			||||||
           VMADDSUBIDUP(4,%r8,T1,UChi_01)     VMADDSUBIDUP(4,%r8,T2,UChi_11) \
 | 
					           VMADDSUBIDUP(4,%r8,T1,UChi_01)     VMADDSUBIDUP(4,%r8,T2,UChi_11) \
 | 
				
			||||||
           VMADDSUBIDUP(7,%r8,T1,UChi_02)     VMADDSUBIDUP(7,%r8,T2,UChi_12) \
 | 
					           VMADDSUBIDUP(7,%r8,T1,UChi_02)     VMADDSUBIDUP(7,%r8,T2,UChi_12) \
 | 
				
			||||||
	   VPREFETCHG(8,%r9)					   \
 | 
						   VPREFETCH_M1(8,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(9,%r9)					   \
 | 
						   VPREFETCH_M1(9,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(10,%r9)					   \
 | 
						   VPREFETCH_M1(10,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(11,%r9)					   \
 | 
						   VPREFETCH_M1(11,%r9)					   \
 | 
				
			||||||
           /*22*/						   \
 | 
					           /*22*/						   \
 | 
				
			||||||
           VSHUF(Chi_02,T1)    VSHUF(Chi_12,T2)	                   \
 | 
					           VSHUF(Chi_02,T1)    VSHUF(Chi_12,T2)	                   \
 | 
				
			||||||
           VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
 | 
					           VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
 | 
				
			||||||
           VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
 | 
					           VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
 | 
				
			||||||
           VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
 | 
					           VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
 | 
				
			||||||
	   VPREFETCH2(12,%r9)					   \
 | 
						   VPREFETCH_M2(12,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(13,%r9)					   \
 | 
						   VPREFETCH_M2(13,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(14,%r9)					   \
 | 
						   VPREFETCH_M2(14,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(15,%r9)					   \
 | 
						   VPREFETCH_M2(15,%r9)					   \
 | 
				
			||||||
	   /*30*/						   \
 | 
						   /*30*/						   \
 | 
				
			||||||
           VMADDSUBIDUP(2,%r8,T1,UChi_00)     VMADDSUBIDUP(2,%r8,T2,UChi_10)	   \
 | 
					           VMADDSUBIDUP(2,%r8,T1,UChi_00)     VMADDSUBIDUP(2,%r8,T2,UChi_10)	   \
 | 
				
			||||||
           VMADDSUBIDUP(5,%r8,T1,UChi_01)     VMADDSUBIDUP(5,%r8,T2,UChi_11)     \
 | 
					           VMADDSUBIDUP(5,%r8,T1,UChi_01)     VMADDSUBIDUP(5,%r8,T2,UChi_11)     \
 | 
				
			||||||
	   VPREFETCH2(16,%r9)					   \
 | 
						   VPREFETCH_M2(16,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(17,%r9)					   \
 | 
						   VPREFETCH_M2(17,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(18,%r9)					   \
 | 
						   VPREFETCH_M2(18,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(19,%r9)					   \
 | 
						   VPREFETCH_M2(19,%r9)					   \
 | 
				
			||||||
           VMADDSUBIDUP(8,%r8,T1,UChi_02)     VMADDSUBIDUP(8,%r8,T2,UChi_12)     \
 | 
					           VMADDSUBIDUP(8,%r8,T1,UChi_02)     VMADDSUBIDUP(8,%r8,T2,UChi_12)     \
 | 
				
			||||||
	   /*36*/					           \
 | 
						   /*36*/					           \
 | 
				
			||||||
           VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
 | 
					           VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
 | 
				
			||||||
           VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
 | 
					           VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
 | 
				
			||||||
           VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
 | 
					           VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
 | 
				
			||||||
	   VPREFETCH2(20,%r9)					   \
 | 
						   VPREFETCH_M2(20,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(21,%r9)					   \
 | 
						   VPREFETCH_M2(21,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(22,%r9)					   \
 | 
						   VPREFETCH_M2(22,%r9)					   \
 | 
				
			||||||
	   VPREFETCH2(23,%r9)					   \
 | 
						   VPREFETCH_M2(23,%r9)					   \
 | 
				
			||||||
	   VPREFETCHG(2,%r8)					   \
 | 
						   VPREFETCH_G1(2,%r8)					   \
 | 
				
			||||||
	   VPREFETCHG(3,%r8)					   \
 | 
						   VPREFETCH_G1(3,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(4,%r8)					   \
 | 
						   VPREFETCH_G2(4,%r8)					   \
 | 
				
			||||||
	   VPREFETCH2(5,%r8)					   \
 | 
						   VPREFETCH_G2(5,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH_G2(6,%r8)					   \
 | 
				
			||||||
 | 
						   VPREFETCH_G2(7,%r8)					   \
 | 
				
			||||||
	   /*42 insns*/						);
 | 
						   /*42 insns*/						);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)				   \
 | 
					#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)				   \
 | 
				
			||||||
@@ -793,8 +851,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
				
			|||||||
           VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
 | 
					           VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
 | 
				
			||||||
           VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
 | 
					           VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
 | 
				
			||||||
           VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
 | 
					           VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
 | 
				
			||||||
	   /*	   VPREFETCHG(2,%r8)*/				   \
 | 
						   /*	   VPREFETCH1(2,%r8)*/				   \
 | 
				
			||||||
	   /*	   VPREFETCHG(3,%r8)*/				   \
 | 
						   /*	   VPREFETCH1(3,%r8)*/				   \
 | 
				
			||||||
	   /*42 insns*/						);
 | 
						   /*42 insns*/						);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										0
									
								
								lib/stencil/.dirstamp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								lib/stencil/.dirstamp
									
									
									
									
									
										Normal file
									
								
							@@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
  grid = _grid;
 | 
					  grid = _grid;
 | 
				
			||||||
  if ( Block[0]==0) ZGraph();
 | 
					  if ( Block[0]==0) ZGraph();
 | 
				
			||||||
 | 
					  else if ( Block[1]==0) NoBlocking();
 | 
				
			||||||
  else CartesianBlocking();
 | 
					  else CartesianBlocking();
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void LebesgueOrder::NoBlocking(void) 
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
 | 
				
			||||||
 | 
					  _LebesgueReorder.resize(0);
 | 
				
			||||||
 | 
					  for ( int s = 0 ; s!= grid->oSites();s++){
 | 
				
			||||||
 | 
					    _LebesgueReorder.push_back(s);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
void LebesgueOrder::CartesianBlocking(void) 
 | 
					void LebesgueOrder::CartesianBlocking(void) 
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  _LebesgueReorder.resize(0);
 | 
					  _LebesgueReorder.resize(0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::cout << GridLogMessage << " CartesianBlocking ";
 | 
					  std::cout << GridLogDebug << " CartesianBlocking ";
 | 
				
			||||||
  for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
 | 
					  //    for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
 | 
				
			||||||
  std::cout<<std::endl; 
 | 
					  //    std::cout<<std::endl; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  IndexInteger ND = grid->_ndimension;
 | 
					  IndexInteger ND = grid->_ndimension;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -117,6 +126,7 @@ void LebesgueOrder::ZGraph(void)
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
  _LebesgueReorder.resize(0);
 | 
					  _LebesgueReorder.resize(0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
 | 
				
			||||||
  // Align up dimensions to power of two.
 | 
					  // Align up dimensions to power of two.
 | 
				
			||||||
  const IndexInteger one=1;
 | 
					  const IndexInteger one=1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -59,6 +59,7 @@ namespace Grid {
 | 
				
			|||||||
    // Cartesian stencil blocking strategy
 | 
					    // Cartesian stencil blocking strategy
 | 
				
			||||||
    /////////////////////////////////
 | 
					    /////////////////////////////////
 | 
				
			||||||
    static std::vector<int> Block;
 | 
					    static std::vector<int> Block;
 | 
				
			||||||
 | 
					    void NoBlocking(void);
 | 
				
			||||||
    void CartesianBlocking(void);
 | 
					    void CartesianBlocking(void);
 | 
				
			||||||
    void IterateO(int ND,int dim,
 | 
					    void IterateO(int ND,int dim,
 | 
				
			||||||
		  std::vector<IndexInteger> & xo,
 | 
							  std::vector<IndexInteger> & xo,
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user