mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Merge remote-tracking branch 'upstream/master'
Conflicts: lib/math/Grid_math_tensors.h lib/simd/Grid_vector_types.h
This commit is contained in:
		@@ -23,7 +23,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=16;lat+=4){
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=2){
 | 
			
		||||
    for(int Ls=1;Ls<=16;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
@@ -94,8 +94,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=16;lat+=4){
 | 
			
		||||
  for(int lat=4;lat<=32;lat+=2){
 | 
			
		||||
    for(int Ls=1;Ls<=16;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										11
									
								
								benchmarks/Grid_su3_expr.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								benchmarks/Grid_su3_expr.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,11 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
void su3_test_mult_expression(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y)
 | 
			
		||||
{
 | 
			
		||||
  z=x*y;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										11
									
								
								benchmarks/Grid_su3_test.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								benchmarks/Grid_su3_test.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,11 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void su3_test_mult_routine(LatticeColourMatrix &z, LatticeColourMatrix &x,LatticeColourMatrix &y)
 | 
			
		||||
{
 | 
			
		||||
  mult(z,x,y);
 | 
			
		||||
}
 | 
			
		||||
@@ -24,7 +24,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
@@ -36,11 +37,11 @@ int main (int argc, char ** argv)
 | 
			
		||||
  //  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
  pRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src(&Grid); random(pRNG,src);
 | 
			
		||||
  LatticeFermion src   (&Grid); random(pRNG,src);
 | 
			
		||||
  LatticeFermion result(&Grid); result=zero;
 | 
			
		||||
  LatticeFermion    ref(&Grid);    ref=zero;
 | 
			
		||||
  LatticeFermion    err(&Grid);    
 | 
			
		||||
  LatticeFermion    tmp(&Grid);    tmp=zero;
 | 
			
		||||
  LatticeFermion    err(&Grid);    tmp=zero;
 | 
			
		||||
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,&Grid);
 | 
			
		||||
 | 
			
		||||
@@ -51,8 +52,11 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  // Only one non-zero (y)
 | 
			
		||||
  Umu=zero;
 | 
			
		||||
  Complex cone(1.0,0.0);
 | 
			
		||||
  for(int nn=0;nn<Nd;nn++){
 | 
			
		||||
    random(pRNG,U[nn]);
 | 
			
		||||
    if (nn!=0) U[nn]=zero;
 | 
			
		||||
    else U[nn] = cone;
 | 
			
		||||
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -78,7 +82,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
  WilsonMatrix Dw(Umu,mass);
 | 
			
		||||
  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
 | 
			
		||||
  
 | 
			
		||||
  std::cout << "Calling Dw"<<std::endl;
 | 
			
		||||
  int ncall=1000;
 | 
			
		||||
@@ -93,7 +97,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
  std::cout << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
			
		||||
  err = ref -result;
 | 
			
		||||
  err = ref-result; 
 | 
			
		||||
  std::cout << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -129,9 +133,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout << "Called DwDag"<<std::endl;
 | 
			
		||||
  std::cout << "norm result "<< norm2(result)<<std::endl;
 | 
			
		||||
  std::cout << "norm ref    "<< norm2(ref)<<std::endl;
 | 
			
		||||
  err = ref -result;
 | 
			
		||||
  err = ref-result; 
 | 
			
		||||
  std::cout << "norm diff   "<< norm2(err)<<std::endl;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										61
									
								
								benchmarks/Grid_wilson_cg_prec.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								benchmarks/Grid_wilson_cg_prec.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,61 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  Gamma::GammaMatrix Gmu [] = {
 | 
			
		||||
    Gamma::GammaX,
 | 
			
		||||
    Gamma::GammaY,
 | 
			
		||||
    Gamma::GammaZ,
 | 
			
		||||
    Gamma::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src(&Grid); random(pRNG,src);
 | 
			
		||||
  RealD nrm = norm2(src);
 | 
			
		||||
  LatticeFermion result(&Grid); result=zero;
 | 
			
		||||
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 | 
			
		||||
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,&Grid);
 | 
			
		||||
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U[mu] = peekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  RealD mass=0.5;
 | 
			
		||||
  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
 | 
			
		||||
 | 
			
		||||
  //  HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
 | 
			
		||||
  //  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
 | 
			
		||||
  //  CG(HermOp,src,result);
 | 
			
		||||
  
 | 
			
		||||
  LatticeFermion    src_o(&RBGrid);
 | 
			
		||||
  LatticeFermion result_o(&RBGrid);
 | 
			
		||||
  pickCheckerboard(Odd,src_o,src);
 | 
			
		||||
  result_o=zero;
 | 
			
		||||
 | 
			
		||||
  HermitianCheckerBoardedOperator<WilsonMatrix,LatticeFermion> HermOpEO(Dw);
 | 
			
		||||
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
 | 
			
		||||
  CG(HermOpEO,src_o,result_o);
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										48
									
								
								benchmarks/Grid_wilson_cg_schur.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								benchmarks/Grid_wilson_cg_schur.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,48 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  Gamma::GammaMatrix Gmu [] = {
 | 
			
		||||
    Gamma::GammaX,
 | 
			
		||||
    Gamma::GammaY,
 | 
			
		||||
    Gamma::GammaZ,
 | 
			
		||||
    Gamma::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
 | 
			
		||||
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion    src(&Grid); random(pRNG,src);
 | 
			
		||||
  LatticeFermion result(&Grid); result=zero;
 | 
			
		||||
  LatticeFermion resid(&Grid); 
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.5;
 | 
			
		||||
  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
 | 
			
		||||
 | 
			
		||||
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
 | 
			
		||||
  SchurRedBlackSolve<LatticeFermion> SchurSolver(CG);
 | 
			
		||||
 | 
			
		||||
  SchurSolver(Dw,src,result);
 | 
			
		||||
  
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -24,13 +24,14 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  GridParallelRNG          pRNG(&Grid);  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src(&Grid); random(pRNG,src);
 | 
			
		||||
  std::cout << "src norm" << norm2(src)<<std::endl;
 | 
			
		||||
  RealD nrm = norm2(src);
 | 
			
		||||
  LatticeFermion result(&Grid); result=zero;
 | 
			
		||||
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 | 
			
		||||
 | 
			
		||||
@@ -46,11 +47,11 @@ int main (int argc, char ** argv)
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  RealD mass=0.5;
 | 
			
		||||
  WilsonMatrix Dw(Umu,mass);
 | 
			
		||||
  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
 | 
			
		||||
 | 
			
		||||
  HermitianOperator<WilsonMatrix,LatticeFermion> HermOp(Dw);
 | 
			
		||||
  ConjugateGradient<LatticeFermion> CG(1.0e-8,10000);
 | 
			
		||||
  CG(HermOp,src,result);
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										201
									
								
								benchmarks/Grid_wilson_evenodd.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										201
									
								
								benchmarks/Grid_wilson_evenodd.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,201 @@
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
template<class d>
 | 
			
		||||
struct scal {
 | 
			
		||||
  d internal;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  Gamma::GammaMatrix Gmu [] = {
 | 
			
		||||
    Gamma::GammaX,
 | 
			
		||||
    Gamma::GammaY,
 | 
			
		||||
    Gamma::GammaZ,
 | 
			
		||||
    Gamma::GammaT
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  int threads = GridThread::GetThreads();
 | 
			
		||||
  std::cout << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG          pRNG(&Grid);
 | 
			
		||||
  //  std::vector<int> seeds({1,2,3,4});
 | 
			
		||||
  //  pRNG.SeedFixedIntegers(seeds);
 | 
			
		||||
  pRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src   (&Grid); random(pRNG,src);
 | 
			
		||||
  LatticeFermion phi   (&Grid); random(pRNG,phi);
 | 
			
		||||
  LatticeFermion chi   (&Grid); random(pRNG,chi);
 | 
			
		||||
  LatticeFermion result(&Grid); result=zero;
 | 
			
		||||
  LatticeFermion    ref(&Grid);    ref=zero;
 | 
			
		||||
  LatticeFermion    tmp(&Grid);    tmp=zero;
 | 
			
		||||
  LatticeFermion    err(&Grid);    tmp=zero;
 | 
			
		||||
  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
 | 
			
		||||
  std::vector<LatticeColourMatrix> U(4,&Grid);
 | 
			
		||||
 | 
			
		||||
  double volume=1;
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    volume=volume*latt_size[mu];
 | 
			
		||||
  }  
 | 
			
		||||
 | 
			
		||||
  // Only one non-zero (y)
 | 
			
		||||
  Umu=zero;
 | 
			
		||||
  for(int nn=0;nn<Nd;nn++){
 | 
			
		||||
    random(pRNG,U[nn]);
 | 
			
		||||
    pokeIndex<LorentzIndex>(Umu,U[nn],nn);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  RealD mass=0.1;
 | 
			
		||||
 | 
			
		||||
  WilsonMatrix Dw(Umu,Grid,RBGrid,mass);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion src_e   (&RBGrid);
 | 
			
		||||
  LatticeFermion src_o   (&RBGrid);
 | 
			
		||||
  LatticeFermion r_e   (&RBGrid);
 | 
			
		||||
  LatticeFermion r_o   (&RBGrid);
 | 
			
		||||
  LatticeFermion r_eo  (&Grid);
 | 
			
		||||
 | 
			
		||||
  const int Even=0;
 | 
			
		||||
  const int Odd=1;
 | 
			
		||||
  std::cout<<"=========================================================="<<std::endl;
 | 
			
		||||
  std::cout<<"= Testing that Deo + Doe = Dunprec "<<std::endl;
 | 
			
		||||
  std::cout<<"=========================================================="<<std::endl;
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even,src_e,src);
 | 
			
		||||
  pickCheckerboard(Odd,src_o,src);
 | 
			
		||||
 | 
			
		||||
  Dw.Meooe(src_e,r_o);  std::cout<<"Applied Meo"<<std::endl;
 | 
			
		||||
  Dw.Meooe(src_o,r_e);  std::cout<<"Applied Moe"<<std::endl;
 | 
			
		||||
  Dw.Dhop (src,ref,0);
 | 
			
		||||
 | 
			
		||||
  setCheckerboard(r_eo,r_o);
 | 
			
		||||
  setCheckerboard(r_eo,r_e);
 | 
			
		||||
 | 
			
		||||
  ref = (-0.5)*ref;
 | 
			
		||||
  err= ref - r_eo;
 | 
			
		||||
  std::cout << "EO norm diff   "<< norm2(err)<< " "<<norm2(ref)<< " " << norm2(r_eo) <<std::endl;
 | 
			
		||||
 | 
			
		||||
  LatticeComplex cerr(&Grid);
 | 
			
		||||
  cerr = localInnerProduct(err,err);
 | 
			
		||||
 | 
			
		||||
  std::cout<<"=============================================================="<<std::endl;
 | 
			
		||||
  std::cout<<"= Test Ddagger is the dagger of D by requiring                "<<std::endl;
 | 
			
		||||
  std::cout<<"=  < phi | Deo | chi > * = < chi | Deo^dag| phi>  "<<std::endl;
 | 
			
		||||
  std::cout<<"=============================================================="<<std::endl;
 | 
			
		||||
  
 | 
			
		||||
  LatticeFermion chi_e   (&RBGrid);
 | 
			
		||||
  LatticeFermion chi_o   (&RBGrid);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion dchi_e  (&RBGrid);
 | 
			
		||||
  LatticeFermion dchi_o  (&RBGrid);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion phi_e   (&RBGrid);
 | 
			
		||||
  LatticeFermion phi_o   (&RBGrid);
 | 
			
		||||
 | 
			
		||||
  LatticeFermion dphi_e  (&RBGrid);
 | 
			
		||||
  LatticeFermion dphi_o  (&RBGrid);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even,chi_e,chi);
 | 
			
		||||
  pickCheckerboard(Odd ,chi_o,chi);
 | 
			
		||||
  pickCheckerboard(Even,phi_e,phi);
 | 
			
		||||
  pickCheckerboard(Odd ,phi_o,phi);
 | 
			
		||||
 | 
			
		||||
  Dw.Meooe(chi_e,dchi_o);
 | 
			
		||||
  Dw.Meooe(chi_o,dchi_e);
 | 
			
		||||
  Dw.MeooeDag(phi_e,dphi_o);
 | 
			
		||||
  Dw.MeooeDag(phi_o,dphi_e);
 | 
			
		||||
 | 
			
		||||
  ComplexD pDce = innerProduct(phi_e,dchi_e);
 | 
			
		||||
  ComplexD pDco = innerProduct(phi_o,dchi_o);
 | 
			
		||||
  ComplexD cDpe = innerProduct(chi_e,dphi_e);
 | 
			
		||||
  ComplexD cDpo = innerProduct(chi_o,dphi_o);
 | 
			
		||||
 | 
			
		||||
  std::cout <<"e "<<pDce<<" "<<cDpe <<std::endl;
 | 
			
		||||
  std::cout <<"o "<<pDco<<" "<<cDpo <<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout <<"pDce - conj(cDpo) "<< pDce-conj(cDpo) <<std::endl;
 | 
			
		||||
  std::cout <<"pDco - conj(cDpe) "<< pDco-conj(cDpe) <<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout<<"=============================================================="<<std::endl;
 | 
			
		||||
  std::cout<<"= Test MeeInv Mee = 1                                         "<<std::endl;
 | 
			
		||||
  std::cout<<"=============================================================="<<std::endl;
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even,chi_e,chi);
 | 
			
		||||
  pickCheckerboard(Odd ,chi_o,chi);
 | 
			
		||||
 | 
			
		||||
  Dw.Mooee(chi_e,src_e);
 | 
			
		||||
  Dw.MooeeInv(src_e,phi_e);
 | 
			
		||||
 | 
			
		||||
  Dw.Mooee(chi_o,src_o);
 | 
			
		||||
  Dw.MooeeInv(src_o,phi_o);
 | 
			
		||||
  
 | 
			
		||||
  setCheckerboard(phi,phi_e);
 | 
			
		||||
  setCheckerboard(phi,phi_o);
 | 
			
		||||
 | 
			
		||||
  err = phi-chi;
 | 
			
		||||
  std::cout << "norm diff   "<< norm2(err)<< std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout<<"=============================================================="<<std::endl;
 | 
			
		||||
  std::cout<<"= Test MeeInvDag MeeDag = 1                                   "<<std::endl;
 | 
			
		||||
  std::cout<<"=============================================================="<<std::endl;
 | 
			
		||||
 | 
			
		||||
  pickCheckerboard(Even,chi_e,chi);
 | 
			
		||||
  pickCheckerboard(Odd ,chi_o,chi);
 | 
			
		||||
 | 
			
		||||
  Dw.MooeeDag(chi_e,src_e);
 | 
			
		||||
  Dw.MooeeInvDag(src_e,phi_e);
 | 
			
		||||
 | 
			
		||||
  Dw.MooeeDag(chi_o,src_o);
 | 
			
		||||
  Dw.MooeeInvDag(src_o,phi_o);
 | 
			
		||||
  
 | 
			
		||||
  setCheckerboard(phi,phi_e);
 | 
			
		||||
  setCheckerboard(phi,phi_o);
 | 
			
		||||
 | 
			
		||||
  err = phi-chi;
 | 
			
		||||
  std::cout << "norm diff   "<< norm2(err)<< std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout<<"=============================================================="<<std::endl;
 | 
			
		||||
  std::cout<<"= Test MpcDagMpc is Hermitian              "<<std::endl;
 | 
			
		||||
  std::cout<<"=============================================================="<<std::endl;
 | 
			
		||||
  
 | 
			
		||||
  random(pRNG,phi);
 | 
			
		||||
  random(pRNG,chi);
 | 
			
		||||
  pickCheckerboard(Even,chi_e,chi);
 | 
			
		||||
  pickCheckerboard(Odd ,chi_o,chi);
 | 
			
		||||
  pickCheckerboard(Even,phi_e,phi);
 | 
			
		||||
  pickCheckerboard(Odd ,phi_o,phi);
 | 
			
		||||
  RealD t1,t2;
 | 
			
		||||
 | 
			
		||||
  Dw.MpcDagMpc(chi_e,dchi_e,t1,t2);
 | 
			
		||||
  Dw.MpcDagMpc(chi_o,dchi_o,t1,t2);
 | 
			
		||||
 | 
			
		||||
  Dw.MpcDagMpc(phi_e,dphi_e,t1,t2);
 | 
			
		||||
  Dw.MpcDagMpc(phi_o,dphi_o,t1,t2);
 | 
			
		||||
 | 
			
		||||
  pDce = innerProduct(phi_e,dchi_e);
 | 
			
		||||
  pDco = innerProduct(phi_o,dchi_o);
 | 
			
		||||
  cDpe = innerProduct(chi_e,dphi_e);
 | 
			
		||||
  cDpo = innerProduct(chi_o,dphi_o);
 | 
			
		||||
 | 
			
		||||
  std::cout <<"e "<<pDce<<" "<<cDpe <<std::endl;
 | 
			
		||||
  std::cout <<"o "<<pDco<<" "<<cDpo <<std::endl;
 | 
			
		||||
 | 
			
		||||
  std::cout <<"pDce - conj(cDpo) "<< pDco-conj(cDpo) <<std::endl;
 | 
			
		||||
  std::cout <<"pDco - conj(cDpe) "<< pDce-conj(cDpe) <<std::endl;
 | 
			
		||||
  
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
@@ -5,18 +5,27 @@ AM_LDFLAGS = -L$(top_builddir)/lib
 | 
			
		||||
#
 | 
			
		||||
# Test code
 | 
			
		||||
#
 | 
			
		||||
bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec
 | 
			
		||||
bin_PROGRAMS = Grid_wilson Grid_comms Grid_memory_bandwidth Grid_su3 Grid_wilson_cg_unprec Grid_wilson_evenodd  Grid_wilson_cg_prec Grid_wilson_cg_schur
 | 
			
		||||
 | 
			
		||||
Grid_wilson_SOURCES = Grid_wilson.cc
 | 
			
		||||
Grid_wilson_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_wilson_evenodd_SOURCES = Grid_wilson_evenodd.cc
 | 
			
		||||
Grid_wilson_evenodd_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_wilson_cg_unprec_SOURCES = Grid_wilson_cg_unprec.cc
 | 
			
		||||
Grid_wilson_cg_unprec_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_wilson_cg_prec_SOURCES = Grid_wilson_cg_prec.cc
 | 
			
		||||
Grid_wilson_cg_prec_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_wilson_cg_schur_SOURCES = Grid_wilson_cg_schur.cc
 | 
			
		||||
Grid_wilson_cg_schur_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_comms_SOURCES = Grid_comms.cc
 | 
			
		||||
Grid_comms_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_su3_SOURCES = Grid_su3.cc
 | 
			
		||||
Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc Grid_su3_expr.cc
 | 
			
		||||
Grid_su3_LDADD = -lGrid
 | 
			
		||||
 | 
			
		||||
Grid_memory_bandwidth_SOURCES = Grid_memory_bandwidth.cc
 | 
			
		||||
 
 | 
			
		||||
@@ -11,6 +11,7 @@
 | 
			
		||||
#include <sys/time.h>
 | 
			
		||||
#include <signal.h>
 | 
			
		||||
#include <iostream>
 | 
			
		||||
#include <iterator>
 | 
			
		||||
#include <Grid.h>
 | 
			
		||||
#include <algorithm>
 | 
			
		||||
#include <iterator>
 | 
			
		||||
 
 | 
			
		||||
@@ -13,28 +13,6 @@
 | 
			
		||||
 | 
			
		||||
typedef uint32_t Integer;
 | 
			
		||||
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
#include <pmmintrin.h>
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
#include <immintrin.h>
 | 
			
		||||
    
 | 
			
		||||
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
 | 
			
		||||
#ifndef _mm256_set_m128i
 | 
			
		||||
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
#include <immintrin.h>
 | 
			
		||||
#ifndef KNC_ONLY_STORES
 | 
			
		||||
#define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
 | 
			
		||||
#define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef  float  RealF;
 | 
			
		||||
@@ -117,7 +95,7 @@ namespace Grid {
 | 
			
		||||
  template<>            inline void zeroit(RealF &arg){ arg=0; };
 | 
			
		||||
  template<>            inline void zeroit(RealD &arg){ arg=0; };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  // Eventually delete this part
 | 
			
		||||
#if defined (SSE4)
 | 
			
		||||
    typedef __m128 fvec;
 | 
			
		||||
    typedef __m128d dvec;
 | 
			
		||||
@@ -146,6 +124,7 @@ namespace Grid {
 | 
			
		||||
    typedef vector4double dvec;
 | 
			
		||||
    typedef vector4double zvec;
 | 
			
		||||
#endif
 | 
			
		||||
  
 | 
			
		||||
#if defined (AVX1) || defined (AVX2) || defined (AVX512)
 | 
			
		||||
    inline void v_prefetch0(int size, const char *ptr){
 | 
			
		||||
          for(int i=0;i<size;i+=64){ //  Define L1 linesize above// What about SSE?
 | 
			
		||||
 
 | 
			
		||||
@@ -120,8 +120,8 @@ namespace Grid {
 | 
			
		||||
	  // Map to always positive shift modulo global full dimension.
 | 
			
		||||
	  int shift = (displacement+fd)%fd;
 | 
			
		||||
	  
 | 
			
		||||
     	  int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
 | 
			
		||||
	  assert (checkerboard== _checkerboard);
 | 
			
		||||
	  //     	  int checkerboard = _grid->CheckerBoardDestination(source.checkerboard,shift);
 | 
			
		||||
	  assert (source.checkerboard== _checkerboard);
 | 
			
		||||
 | 
			
		||||
	  // the permute type
 | 
			
		||||
	  int simd_layout     = _grid->_simd_layout[dimension];
 | 
			
		||||
 
 | 
			
		||||
@@ -5,6 +5,8 @@
 | 
			
		||||
#define GRID_OMP
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define UNROLL  _Pragma("unroll")
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#include <omp.h>
 | 
			
		||||
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for")
 | 
			
		||||
 
 | 
			
		||||
@@ -25,7 +25,7 @@ namespace Grid {
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<class Field> class HermitianOperatorBase : public LinearOperatorBase<Field> {
 | 
			
		||||
    public:
 | 
			
		||||
      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2);
 | 
			
		||||
      virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2)=0;
 | 
			
		||||
      void AdjOp(const Field &in, Field &out) {
 | 
			
		||||
	Op(in,out);
 | 
			
		||||
      };
 | 
			
		||||
@@ -106,6 +106,10 @@ namespace Grid {
 | 
			
		||||
    public:
 | 
			
		||||
      virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
 | 
			
		||||
    };
 | 
			
		||||
    template<class Field> class HermitianOperatorFunction {
 | 
			
		||||
    public:
 | 
			
		||||
      virtual void operator() (HermitianOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    // FIXME : To think about
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -10,6 +10,7 @@ namespace Grid {
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<class Field> class SparseMatrixBase {
 | 
			
		||||
    public:
 | 
			
		||||
      GridBase *_grid;
 | 
			
		||||
      // Full checkerboar operations
 | 
			
		||||
      virtual RealD M    (const Field &in, Field &out)=0;
 | 
			
		||||
      virtual RealD Mdag (const Field &in, Field &out)=0;
 | 
			
		||||
@@ -18,6 +19,7 @@ namespace Grid {
 | 
			
		||||
	ni=M(in,tmp);
 | 
			
		||||
	no=Mdag(tmp,out);
 | 
			
		||||
      }
 | 
			
		||||
      SparseMatrixBase(GridBase *grid) : _grid(grid) {};
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -25,7 +27,7 @@ namespace Grid {
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
 | 
			
		||||
    public:
 | 
			
		||||
      
 | 
			
		||||
      GridBase *_cbgrid;
 | 
			
		||||
      // half checkerboard operaions
 | 
			
		||||
      virtual  void Meooe    (const Field &in, Field &out)=0;
 | 
			
		||||
      virtual  void Mooee    (const Field &in, Field &out)=0;
 | 
			
		||||
@@ -44,9 +46,7 @@ namespace Grid {
 | 
			
		||||
	Meooe(out,tmp);
 | 
			
		||||
 | 
			
		||||
	Mooee(in,out);
 | 
			
		||||
	out=out-tmp; // axpy_norm
 | 
			
		||||
	RealD n=norm2(out);
 | 
			
		||||
	return n;
 | 
			
		||||
	return axpy_norm(out,-1.0,tmp,out);
 | 
			
		||||
      }
 | 
			
		||||
      virtual  RealD MpcDag   (const Field &in, Field &out){
 | 
			
		||||
	Field tmp(in._grid);
 | 
			
		||||
@@ -56,15 +56,15 @@ namespace Grid {
 | 
			
		||||
	MeooeDag(out,tmp);
 | 
			
		||||
 | 
			
		||||
	MooeeDag(in,out);
 | 
			
		||||
	out=out-tmp; // axpy_norm
 | 
			
		||||
	RealD n=norm2(out);
 | 
			
		||||
	return n;
 | 
			
		||||
	return axpy_norm(out,-1.0,tmp,out);
 | 
			
		||||
      }
 | 
			
		||||
      virtual void MpcDagMpc(const Field &in, Field &out,RealD ni,RealD no) {
 | 
			
		||||
      virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
 | 
			
		||||
	Field tmp(in._grid);
 | 
			
		||||
	ni=Mpc(in,tmp);
 | 
			
		||||
	no=Mpc(tmp,out);
 | 
			
		||||
	no=MpcDag(tmp,out);
 | 
			
		||||
	//	std::cout<<"MpcDagMpc "<<ni<<" "<<no<<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
      CheckerBoardedSparseMatrixBase(GridBase *grid,GridBase *cbgrid) : SparseMatrixBase<Field>(grid), _cbgrid(cbgrid) {};
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -31,7 +31,7 @@ public:
 | 
			
		||||
  bigfloat(const double d) { mpf_init_set_d(x, d); }  
 | 
			
		||||
  bigfloat(const char *str) { mpf_init_set_str(x, (char*)str, 10); }
 | 
			
		||||
  ~bigfloat(void) { mpf_clear(x); }
 | 
			
		||||
  operator const double (void) const { return (double)mpf_get_d(x); }
 | 
			
		||||
  operator double (void) const { return (double)mpf_get_d(x); }
 | 
			
		||||
  static void setDefaultPrecision(unsigned long dprec) {
 | 
			
		||||
    unsigned long bprec =  (unsigned long)(3.321928094 * (double)dprec);
 | 
			
		||||
    mpf_set_default_prec(bprec);
 | 
			
		||||
 
 | 
			
		||||
@@ -9,18 +9,21 @@ namespace Grid {
 | 
			
		||||
    /////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  template<class Field> 
 | 
			
		||||
    class ConjugateGradient :  public OperatorFunction<Field> {
 | 
			
		||||
    class ConjugateGradient : public HermitianOperatorFunction<Field> {
 | 
			
		||||
public:                                                
 | 
			
		||||
    RealD   Tolerance;
 | 
			
		||||
    Integer MaxIterations;
 | 
			
		||||
 | 
			
		||||
    int verbose;
 | 
			
		||||
    ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { 
 | 
			
		||||
      std::cout << Tolerance<<std::endl;
 | 
			
		||||
      verbose=0;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi) {assert(0);};
 | 
			
		||||
 | 
			
		||||
    void operator() (HermitianOperatorBase<Field> &Linop,const Field &src, Field &psi){
 | 
			
		||||
 | 
			
		||||
      psi.checkerboard = src.checkerboard;
 | 
			
		||||
      conformable(psi,src);
 | 
			
		||||
 | 
			
		||||
      RealD cp,c,a,d,b,ssq,qq,b_pred;
 | 
			
		||||
      
 | 
			
		||||
      Field   p(src);
 | 
			
		||||
@@ -39,12 +42,14 @@ public:
 | 
			
		||||
      cp =a;
 | 
			
		||||
      ssq=norm2(src);
 | 
			
		||||
 | 
			
		||||
      std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
 | 
			
		||||
      std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
 | 
			
		||||
      std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
 | 
			
		||||
      std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
 | 
			
		||||
      std::cout <<std::setprecision(4)<< "ConjugateGradient:     r "<<cp   <<std::endl;
 | 
			
		||||
      std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
 | 
			
		||||
      if ( verbose ) {
 | 
			
		||||
	std::cout <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl;
 | 
			
		||||
	std::cout <<std::setprecision(4)<< "ConjugateGradient:   src "<<ssq  <<std::endl;
 | 
			
		||||
	std::cout <<std::setprecision(4)<< "ConjugateGradient:    mp "<<d    <<std::endl;
 | 
			
		||||
	std::cout <<std::setprecision(4)<< "ConjugateGradient:   mmp "<<b    <<std::endl;
 | 
			
		||||
	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,r "<<cp   <<std::endl;
 | 
			
		||||
	std::cout <<std::setprecision(4)<< "ConjugateGradient:     p "<<a    <<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      RealD rsq =  Tolerance* Tolerance*ssq;
 | 
			
		||||
      
 | 
			
		||||
@@ -62,13 +67,15 @@ public:
 | 
			
		||||
	
 | 
			
		||||
	Linop.OpAndNorm(p,mmp,d,qq);
 | 
			
		||||
 | 
			
		||||
	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<std::endl;
 | 
			
		||||
	RealD    qqck = norm2(mmp);
 | 
			
		||||
	ComplexD dck  = innerProduct(p,mmp);
 | 
			
		||||
	//	if (verbose) std::cout <<std::setprecision(4)<< "ConjugateGradient:  d,qq "<<d<< " "<<qq <<" qqcheck "<< qqck<< " dck "<< dck<<std::endl;
 | 
			
		||||
      
 | 
			
		||||
	a      = c/d;
 | 
			
		||||
	b_pred = a*(a*qq-d)/c;
 | 
			
		||||
 | 
			
		||||
	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
 | 
			
		||||
 | 
			
		||||
	//	if (verbose) std::cout <<std::setprecision(4)<< "ConjugateGradient:  a,bp "<<a<< " "<<b_pred <<std::endl;
 | 
			
		||||
	cp = axpy_norm(r,-a,mmp,r);
 | 
			
		||||
	b = cp/c;
 | 
			
		||||
	//	std::cout <<std::setprecision(4)<< "ConjugateGradient:  cp,b "<<cp<< " "<<b <<std::endl;
 | 
			
		||||
@@ -77,7 +84,16 @@ public:
 | 
			
		||||
	psi= a*p+psi;
 | 
			
		||||
	p  = p*b+r;
 | 
			
		||||
	  
 | 
			
		||||
	std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 | 
			
		||||
	if (verbose) std::cout<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
 | 
			
		||||
 | 
			
		||||
	// Hack
 | 
			
		||||
	if (0) {
 | 
			
		||||
	  Field   tt(src);
 | 
			
		||||
      	  Linop.Op(psi,mmp);
 | 
			
		||||
	  tt=mmp-src;
 | 
			
		||||
	  RealD resnorm = norm2(tt);
 | 
			
		||||
	  std::cout<<"ConjugateGradient: Iteration " <<k<<" true residual "<<resnorm << " computed " << cp <<std::endl;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Stopping condition
 | 
			
		||||
	if ( cp <= rsq ) { 
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,7 @@
 | 
			
		||||
#ifndef GRID_SCHUR_RED_BLACK_H
 | 
			
		||||
#define GRID_SCHUR_RED_BLACK_H
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
   * Red black Schur decomposition
 | 
			
		||||
   *
 | 
			
		||||
@@ -25,53 +26,50 @@
 | 
			
		||||
   *     M psi = eta
 | 
			
		||||
   ***********************
 | 
			
		||||
   *Odd
 | 
			
		||||
   * i)   (D_oo)^{\dag} D_oo psi_o = (D_oo)^\dag L^{-1}  eta_o
 | 
			
		||||
   *                        eta_o' = D_oo (eta_o - Moe Mee^{-1} eta_e)
 | 
			
		||||
   * i)   (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1}  eta_o
 | 
			
		||||
   *                        eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
 | 
			
		||||
   *Even
 | 
			
		||||
   * ii)  Mee psi_e + Meo psi_o = src_e
 | 
			
		||||
   *
 | 
			
		||||
   *   => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 | 
			
		||||
   *
 | 
			
		||||
   */
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Take a matrix and form a Red Black solver calling a Herm solver
 | 
			
		||||
  // Use of RB info prevents making SchurRedBlackSolve conform to standard interface
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template<class Field> class SchurRedBlackSolve : public OperatorFunction<Field>{
 | 
			
		||||
  template<class Field> class SchurRedBlackSolve {
 | 
			
		||||
  private:
 | 
			
		||||
    SparseMatrixBase<Field> & _Matrix;
 | 
			
		||||
    OperatorFunction<Field> & _HermitianRBSolver;
 | 
			
		||||
    HermitianOperatorFunction<Field> & _HermitianRBSolver;
 | 
			
		||||
    int CBfactorise;
 | 
			
		||||
  public:
 | 
			
		||||
 | 
			
		||||
    /////////////////////////////////////////////////////
 | 
			
		||||
    // Wrap the usual normal equations Schur trick
 | 
			
		||||
    /////////////////////////////////////////////////////
 | 
			
		||||
  SchurRedBlackSolve(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianRBSolver) 
 | 
			
		||||
    :  _Matrix(Matrix), _HermitianRBSolver(HermitianRBSolver) { 
 | 
			
		||||
  SchurRedBlackSolve(HermitianOperatorFunction<Field> &HermitianRBSolver)  :
 | 
			
		||||
     _HermitianRBSolver(HermitianRBSolver) 
 | 
			
		||||
    { 
 | 
			
		||||
      CBfactorise=0;
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    void operator() (const Field &in, Field &out){
 | 
			
		||||
    template<class Matrix>
 | 
			
		||||
      void operator() (Matrix & _Matrix,const Field &in, Field &out){
 | 
			
		||||
 | 
			
		||||
      // FIXME CGdiagonalMee not implemented virtual function
 | 
			
		||||
      // FIXME need to make eo grid from full grid.
 | 
			
		||||
      // FIXME use CBfactorise to control schur decomp
 | 
			
		||||
      const int Even=0;
 | 
			
		||||
      const int Odd =1;
 | 
			
		||||
      GridBase *grid = _Matrix._cbgrid;
 | 
			
		||||
      GridBase *fgrid= _Matrix._grid;
 | 
			
		||||
 
 | 
			
		||||
      // Make a cartesianRedBlack from full Grid
 | 
			
		||||
      GridRedBlackCartesian grid(in._grid);
 | 
			
		||||
 
 | 
			
		||||
      Field src_e(&grid);
 | 
			
		||||
      Field src_o(&grid);
 | 
			
		||||
      Field sol_e(&grid);
 | 
			
		||||
      Field sol_o(&grid);
 | 
			
		||||
      Field   tmp(&grid);
 | 
			
		||||
      Field  Mtmp(&grid);
 | 
			
		||||
      Field src_e(grid);
 | 
			
		||||
      Field src_o(grid);
 | 
			
		||||
      Field sol_e(grid);
 | 
			
		||||
      Field sol_o(grid);
 | 
			
		||||
      Field   tmp(grid);
 | 
			
		||||
      Field  Mtmp(grid);
 | 
			
		||||
      Field resid(fgrid);
 | 
			
		||||
 | 
			
		||||
      pickCheckerboard(Even,src_e,in);
 | 
			
		||||
      pickCheckerboard(Odd ,src_o,in);
 | 
			
		||||
@@ -79,26 +77,35 @@ namespace Grid {
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      // src_o = Mdag * (source_o - Moe MeeInv source_e)
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      _Matrix.MooeeInv(src_e,tmp);     //    MooeeInv(source[Even],tmp,DaggerNo,Even);
 | 
			
		||||
      _Matrix.Meooe   (tmp,Mtmp);      //    Meo     (tmp,src,Odd,DaggerNo);
 | 
			
		||||
      tmp=src_o-Mtmp;                  //    axpy    (tmp,src,source[Odd],-1.0);
 | 
			
		||||
      _Matrix.MpcDag(tmp,src_o);       //    Mprec(tmp,src,Mtmp,DaggerYes);  
 | 
			
		||||
      _Matrix.MooeeInv(src_e,tmp);     assert(  tmp.checkerboard ==Even);
 | 
			
		||||
      _Matrix.Meooe   (tmp,Mtmp);      assert( Mtmp.checkerboard ==Odd);     
 | 
			
		||||
      tmp=src_o-Mtmp;                  assert(  tmp.checkerboard ==Odd);     
 | 
			
		||||
      _Matrix.MpcDag(tmp,src_o);       assert(src_o.checkerboard ==Odd);       
 | 
			
		||||
 | 
			
		||||
      //////////////////////////////////////////////////////////////
 | 
			
		||||
      // Call the red-black solver
 | 
			
		||||
      //////////////////////////////////////////////////////////////
 | 
			
		||||
      _HermitianRBSolver(src_o,sol_o); //  CGNE_prec_MdagM(solution[Odd],src);
 | 
			
		||||
      HermitianCheckerBoardedOperator<Matrix,Field> _HermOpEO(_Matrix);
 | 
			
		||||
      std::cout << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
 | 
			
		||||
      _HermitianRBSolver(_HermOpEO,src_o,sol_o);  assert(sol_o.checkerboard==Odd);
 | 
			
		||||
 | 
			
		||||
      ///////////////////////////////////////////////////
 | 
			
		||||
      // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
 | 
			
		||||
      ///////////////////////////////////////////////////
 | 
			
		||||
      _Matrix.Meooe(sol_o,tmp);        // Meo(solution[Odd],tmp,Even,DaggerNo);
 | 
			
		||||
      src_e = src_e-tmp;               // axpy(src,tmp,source[Even],-1.0);
 | 
			
		||||
      _Matrix.MooeeInv(src_e,sol_e);   // MooeeInv(src,solution[Even],DaggerNo,Even);
 | 
			
		||||
      _Matrix.Meooe(sol_o,tmp);        assert(  tmp.checkerboard   ==Even);
 | 
			
		||||
      src_e = src_e-tmp;               assert(  src_e.checkerboard ==Even);
 | 
			
		||||
      _Matrix.MooeeInv(src_e,sol_e);   assert(  sol_e.checkerboard ==Even);
 | 
			
		||||
     
 | 
			
		||||
      setCheckerboard(out,sol_e);
 | 
			
		||||
      setCheckerboard(out,sol_o);
 | 
			
		||||
      setCheckerboard(out,sol_e); assert(  sol_e.checkerboard ==Even);
 | 
			
		||||
      setCheckerboard(out,sol_o); assert(  sol_o.checkerboard ==Odd );
 | 
			
		||||
 | 
			
		||||
      // Verify the unprec residual
 | 
			
		||||
      _Matrix.M(out,resid); 
 | 
			
		||||
      resid = resid-in;
 | 
			
		||||
      RealD ns = norm2(in);
 | 
			
		||||
      RealD nr = norm2(resid);
 | 
			
		||||
 | 
			
		||||
      std::cout << "SchurRedBlack solver true unprec resid "<< sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
 | 
			
		||||
    }     
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -4,6 +4,13 @@
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
    static const int CbRed  =0;
 | 
			
		||||
    static const int CbBlack=1;
 | 
			
		||||
    static const int Even   =CbRed;
 | 
			
		||||
    static const int Odd    =CbBlack;
 | 
			
		||||
    static const int DaggerNo=0;
 | 
			
		||||
    static const int DaggerYes=1;
 | 
			
		||||
 | 
			
		||||
// Specialise this for red black grids storing half the data like a chess board.
 | 
			
		||||
class GridRedBlackCartesian : public GridBase
 | 
			
		||||
{
 | 
			
		||||
@@ -44,6 +51,9 @@ public:
 | 
			
		||||
            return source_cb;
 | 
			
		||||
        }
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    GridRedBlackCartesian(GridBase *base) : GridRedBlackCartesian(base->_fdimensions,base->_simd_layout,base->_processors)  {};
 | 
			
		||||
 | 
			
		||||
    GridRedBlackCartesian(std::vector<int> &dimensions,
 | 
			
		||||
			  std::vector<int> &simd_layout,
 | 
			
		||||
			  std::vector<int> &processor_grid ) : GridBase(processor_grid)
 | 
			
		||||
 
 | 
			
		||||
@@ -91,6 +91,47 @@ inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<
 | 
			
		||||
  GridFromExpression(grid,std::get<2>(expr.second));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////
 | 
			
		||||
template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr >
 | 
			
		||||
inline void CBFromExpression(int &cb,const T1& lat)   // Lattice leaf
 | 
			
		||||
{
 | 
			
		||||
  if ( (cb==Odd) || (cb==Even) ) {
 | 
			
		||||
    assert(cb==lat.checkerboard);
 | 
			
		||||
  } 
 | 
			
		||||
  cb=lat.checkerboard;
 | 
			
		||||
  //  std::cout<<"Lattice leaf cb "<<cb<<std::endl;
 | 
			
		||||
}
 | 
			
		||||
template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr >
 | 
			
		||||
inline void CBFromExpression(int &cb,const T1& notlat)   // non-lattice leaf
 | 
			
		||||
{
 | 
			
		||||
  //  std::cout<<"Non lattice leaf cb"<<cb<<std::endl;
 | 
			
		||||
}
 | 
			
		||||
template <typename Op, typename T1>
 | 
			
		||||
inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr)
 | 
			
		||||
{
 | 
			
		||||
  CBFromExpression(cb,std::get<0>(expr.second));// recurse 
 | 
			
		||||
  //  std::cout<<"Unary node cb "<<cb<<std::endl;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename Op, typename T1, typename T2>
 | 
			
		||||
inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) 
 | 
			
		||||
{
 | 
			
		||||
  CBFromExpression(cb,std::get<0>(expr.second));// recurse
 | 
			
		||||
  CBFromExpression(cb,std::get<1>(expr.second));
 | 
			
		||||
  //  std::cout<<"Binary node cb "<<cb<<std::endl;
 | 
			
		||||
}
 | 
			
		||||
template <typename Op, typename T1, typename T2, typename T3>
 | 
			
		||||
inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) 
 | 
			
		||||
{
 | 
			
		||||
  CBFromExpression(cb,std::get<0>(expr.second));// recurse
 | 
			
		||||
  CBFromExpression(cb,std::get<1>(expr.second));
 | 
			
		||||
  CBFromExpression(cb,std::get<2>(expr.second));
 | 
			
		||||
  //  std::cout<<"Trinary node cb "<<cb<<std::endl;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
// Unary operators and funcs
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -9,45 +9,68 @@ namespace Grid {
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mult(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
      //      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
#else
 | 
			
		||||
      mult(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mac(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else
 | 
			
		||||
      mac(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      sub(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else
 | 
			
		||||
      sub(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
    conformable(lhs,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      add(&tmp,&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else
 | 
			
		||||
      add(&ret._odata[ss],&lhs._odata[ss],&rhs._odata[ss]);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
@@ -56,6 +79,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(lhs,ret);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
@@ -67,7 +91,8 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
    conformable(lhs,ret);
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,lhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
@@ -78,22 +103,32 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
    conformable(lhs,ret);
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(ret,lhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      sub(&tmp,&lhs._odata[ss],&rhs);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else 
 | 
			
		||||
      sub(&ret._odata[ss],&lhs._odata[ss],&rhs);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
 | 
			
		||||
    ret.checkerboard = lhs.checkerboard;
 | 
			
		||||
    conformable(lhs,ret);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<lhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      add(&tmp,&lhs._odata[ss],&rhs);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else 
 | 
			
		||||
      add(&ret._odata[ss],&lhs._odata[ss],&rhs);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
@@ -102,84 +137,112 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mult(&tmp,&lhs,&rhs._odata[ss]);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else 
 | 
			
		||||
      mult(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      mac(&tmp,&lhs,&rhs._odata[ss]);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else 
 | 
			
		||||
      mac(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      sub(&tmp,&lhs,&rhs._odata[ss]);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else 
 | 
			
		||||
      sub(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  template<class obj1,class obj2,class obj3> strong_inline
 | 
			
		||||
    void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
 | 
			
		||||
    ret.checkerboard = rhs.checkerboard;
 | 
			
		||||
    conformable(ret,rhs);
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<rhs._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      obj1 tmp;
 | 
			
		||||
      add(&tmp,&lhs,&rhs._odata[ss]);
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else 
 | 
			
		||||
      add(&ret._odata[ss],&lhs,&rhs._odata[ss]);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class sobj,class vobj> strong_inline
 | 
			
		||||
  void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
 | 
			
		||||
    ret.checkerboard = x.checkerboard;
 | 
			
		||||
    conformable(ret,x);
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else
 | 
			
		||||
      ret._odata[ss]=a*x._odata[ss]+y._odata[ss];
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  template<class sobj,class vobj> strong_inline
 | 
			
		||||
  void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
 | 
			
		||||
    ret.checkerboard = x.checkerboard;
 | 
			
		||||
    conformable(ret,x);
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
#else
 | 
			
		||||
      ret._odata[ss]=a*x._odata[ss]+b*y._odata[ss];
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class sobj,class vobj> strong_inline
 | 
			
		||||
  RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
 | 
			
		||||
    ret.checkerboard = x.checkerboard;
 | 
			
		||||
    conformable(ret,x);
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
    }
 | 
			
		||||
    axpy(ret,a,x,y);
 | 
			
		||||
    return norm2(ret);
 | 
			
		||||
  }
 | 
			
		||||
  template<class sobj,class vobj> strong_inline
 | 
			
		||||
  RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
 | 
			
		||||
    ret.checkerboard = x.checkerboard;
 | 
			
		||||
    conformable(ret,x);
 | 
			
		||||
    conformable(x,y);
 | 
			
		||||
#pragma omp parallel for
 | 
			
		||||
    for(int ss=0;ss<x._grid->oSites();ss++){
 | 
			
		||||
      vobj tmp = a*x._odata[ss]+b*y._odata[ss];
 | 
			
		||||
      vstream(ret._odata[ss],tmp);
 | 
			
		||||
    }
 | 
			
		||||
    axpby(ret,a,b,x,y);
 | 
			
		||||
    return norm2(ret); // FIXME implement parallel norm in ss loop
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,8 @@
 | 
			
		||||
#ifndef GRID_LATTICE_BASE_H
 | 
			
		||||
#define GRID_LATTICE_BASE_H
 | 
			
		||||
 | 
			
		||||
#define STREAMING_STORES
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
// TODO: 
 | 
			
		||||
@@ -64,60 +66,136 @@ public:
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  template <typename Op, typename T1>                         strong_inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
 | 
			
		||||
  {
 | 
			
		||||
    GridBase *egrid(nullptr);
 | 
			
		||||
    GridFromExpression(egrid,expr);
 | 
			
		||||
    assert(egrid!=nullptr);
 | 
			
		||||
    conformable(_grid,egrid);
 | 
			
		||||
 | 
			
		||||
    int cb=-1;
 | 
			
		||||
    CBFromExpression(cb,expr);
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      vobj tmp= eval(ss,expr);
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
#else
 | 
			
		||||
      _odata[ss]=eval(ss,expr);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  template <typename Op, typename T1,typename T2>             strong_inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
 | 
			
		||||
  {
 | 
			
		||||
    GridBase *egrid(nullptr);
 | 
			
		||||
    GridFromExpression(egrid,expr);
 | 
			
		||||
    assert(egrid!=nullptr);
 | 
			
		||||
    conformable(_grid,egrid);
 | 
			
		||||
 | 
			
		||||
    int cb=-1;
 | 
			
		||||
    CBFromExpression(cb,expr);
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      vobj tmp= eval(ss,expr);
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
#else
 | 
			
		||||
      _odata[ss]=eval(ss,expr);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  template <typename Op, typename T1,typename T2,typename T3> strong_inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
 | 
			
		||||
  {
 | 
			
		||||
    GridBase *egrid(nullptr);
 | 
			
		||||
    GridFromExpression(egrid,expr);
 | 
			
		||||
    assert(egrid!=nullptr);
 | 
			
		||||
    conformable(_grid,egrid);
 | 
			
		||||
 | 
			
		||||
    int cb=-1;
 | 
			
		||||
    CBFromExpression(cb,expr);
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      vobj tmp= eval(ss,expr);
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
#else
 | 
			
		||||
      _odata[ss] = eval(ss,expr);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  //GridFromExpression is tricky to do
 | 
			
		||||
  template<class Op,class T1>
 | 
			
		||||
    Lattice(const LatticeUnaryExpression<Op,T1> & expr):    _grid(nullptr){
 | 
			
		||||
 | 
			
		||||
    GridFromExpression(_grid,expr);
 | 
			
		||||
    assert(_grid!=nullptr);
 | 
			
		||||
 | 
			
		||||
    int cb=-1;
 | 
			
		||||
    CBFromExpression(cb,expr);
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
    _odata.resize(_grid->oSites());
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      _odata[ss] = eval(ss,expr);
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
#else
 | 
			
		||||
      _odata[ss]=_eval(ss,expr);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class Op,class T1, class T2>
 | 
			
		||||
  Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr):    _grid(nullptr){
 | 
			
		||||
    GridFromExpression(_grid,expr);
 | 
			
		||||
    assert(_grid!=nullptr);
 | 
			
		||||
 | 
			
		||||
    int cb=-1;
 | 
			
		||||
    CBFromExpression(cb,expr);
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
    _odata.resize(_grid->oSites());
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      _odata[ss] = eval(ss,expr);
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(tmp,ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
#else
 | 
			
		||||
      _odata[ss]=eval(ss,expr);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
  template<class Op,class T1, class T2, class T3>
 | 
			
		||||
  Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr):    _grid(nullptr){
 | 
			
		||||
    GridFromExpression(_grid,expr);
 | 
			
		||||
    assert(_grid!=nullptr);
 | 
			
		||||
 | 
			
		||||
    int cb=-1;
 | 
			
		||||
    CBFromExpression(cb,expr);
 | 
			
		||||
    assert( (cb==Odd) || (cb==Even));
 | 
			
		||||
    checkerboard=cb;
 | 
			
		||||
 | 
			
		||||
    _odata.resize(_grid->oSites());
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int ss=0;ss<_grid->oSites();ss++){
 | 
			
		||||
      _odata[ss] = eval(ss,expr);
 | 
			
		||||
#ifdef STREAMING_STORES
 | 
			
		||||
      vobj tmp = eval(ss,expr);
 | 
			
		||||
      vstream(_odata[ss] ,tmp);
 | 
			
		||||
#else
 | 
			
		||||
      _odata[ss]=eval(ss,expr);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
@@ -140,6 +218,7 @@ PARALLEL_FOR_LOOP
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
    template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
 | 
			
		||||
      this->checkerboard = r.checkerboard;
 | 
			
		||||
      conformable(*this,r);
 | 
			
		||||
      std::cout<<"Lattice operator ="<<std::endl;
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
 
 | 
			
		||||
@@ -16,19 +16,18 @@ strong_inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> *
 | 
			
		||||
template<class rrtype,class ltype,class rtype,int N>
 | 
			
		||||
strong_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
 | 
			
		||||
  for(int c1=0;c1<N;c1++){
 | 
			
		||||
    int c3=0;
 | 
			
		||||
    for(int c2=0;c2<N;c2++){
 | 
			
		||||
      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
 | 
			
		||||
      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  for(int c3=1;c3<N;c3++){
 | 
			
		||||
    for(int c1=0;c1<N;c1++){
 | 
			
		||||
  for(int c1=0;c1<N;c1++){
 | 
			
		||||
    for(int c3=1;c3<N;c3++){
 | 
			
		||||
      for(int c2=0;c2<N;c2++){
 | 
			
		||||
	mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
    return;
 | 
			
		||||
  return;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template<class rrtype,class ltype,class rtype,int N>
 | 
			
		||||
 
 | 
			
		||||
@@ -34,66 +34,64 @@ public:
 | 
			
		||||
 | 
			
		||||
  // Scalar no action
 | 
			
		||||
  //  template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;
 | 
			
		||||
  iScalar()=default;
 | 
			
		||||
  iScalar() = default;
 | 
			
		||||
  /*
 | 
			
		||||
  iScalar(const iScalar<vtype> ©me)=default;
 | 
			
		||||
  iScalar(iScalar<vtype> &©me)=default;
 | 
			
		||||
  iScalar<vtype> & operator= (const iScalar<vtype> ©me) = default;
 | 
			
		||||
  iScalar<vtype> & operator= (iScalar<vtype> &©me) = default;
 | 
			
		||||
  */
 | 
			
		||||
  iScalar(scalar_type s) : _internal(s) {};// recurse down and hit the constructor for vector_type
 | 
			
		||||
  iScalar(const Zero &z){ *this = zero; };
 | 
			
		||||
 | 
			
		||||
  iScalar<vtype> & operator= (const Zero &hero){
 | 
			
		||||
    zeroit(*this);
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
 | 
			
		||||
    vstream(out._internal,in._internal);
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void zeroit(iScalar<vtype> &that){
 | 
			
		||||
    zeroit(that._internal);
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void prefetch(iScalar<vtype> &that){
 | 
			
		||||
    prefetch(that._internal);
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
 | 
			
		||||
    permute(out._internal,in._internal,permutetype);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Unary negation
 | 
			
		||||
  friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
 | 
			
		||||
    iScalar<vtype> ret;
 | 
			
		||||
    ret._internal= -r._internal;
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
 | 
			
		||||
  strong_inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
 | 
			
		||||
    *this = (*this)*r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  strong_inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
 | 
			
		||||
    *this = (*this)-r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  strong_inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
 | 
			
		||||
    *this = (*this)+r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  strong_inline vtype & operator ()(void) {
 | 
			
		||||
    return _internal;
 | 
			
		||||
  }
 | 
			
		||||
  strong_inline const vtype & operator ()(void) const {
 | 
			
		||||
    return _internal;
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  operator ComplexD () const { return(TensorRemove(_internal)); };
 | 
			
		||||
  operator RealD () const { return(real(TensorRemove(_internal))); }
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
    iScalar<vtype> & operator= (const Zero &hero){
 | 
			
		||||
      zeroit(*this);
 | 
			
		||||
      return *this;
 | 
			
		||||
    }
 | 
			
		||||
    friend strong_inline void vstream(iScalar<vtype> &out,const iScalar<vtype> &in){
 | 
			
		||||
      vstream(out._internal,in._internal);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    friend strong_inline void zeroit(iScalar<vtype> &that){
 | 
			
		||||
        zeroit(that._internal);
 | 
			
		||||
    }
 | 
			
		||||
    friend strong_inline void prefetch(iScalar<vtype> &that){
 | 
			
		||||
      prefetch(that._internal);
 | 
			
		||||
    }
 | 
			
		||||
    friend strong_inline void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
 | 
			
		||||
      permute(out._internal,in._internal,permutetype);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // Unary negation
 | 
			
		||||
    friend strong_inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
 | 
			
		||||
        iScalar<vtype> ret;
 | 
			
		||||
        ret._internal= -r._internal;
 | 
			
		||||
        return ret;
 | 
			
		||||
    }
 | 
			
		||||
    // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
 | 
			
		||||
    strong_inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
 | 
			
		||||
        *this = (*this)*r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
    strong_inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
 | 
			
		||||
        *this = (*this)-r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
    strong_inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
 | 
			
		||||
        *this = (*this)+r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    strong_inline vtype & operator ()(void) {
 | 
			
		||||
      return _internal;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    strong_inline const vtype & operator ()(void) const {
 | 
			
		||||
      return _internal;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    operator ComplexD () const { return(TensorRemove(_internal)); };
 | 
			
		||||
    operator RealD () const { return(real(TensorRemove(_internal))); }
 | 
			
		||||
 | 
			
		||||
    // convert from a something to a scalar
 | 
			
		||||
    template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar<vtype>
 | 
			
		||||
  // convert from a something to a scalar
 | 
			
		||||
  template<class T,typename std::enable_if<!isGridTensor<T>::value, T>::type* = nullptr > strong_inline auto operator = (T arg) -> iScalar<vtype>
 | 
			
		||||
    { 
 | 
			
		||||
      _internal = vtype(arg);
 | 
			
		||||
      return *this;
 | 
			
		||||
@@ -129,67 +127,72 @@ public:
 | 
			
		||||
  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1};
 | 
			
		||||
  iVector(const Zero &z){ *this = zero; };
 | 
			
		||||
  iVector() =default;
 | 
			
		||||
  /*
 | 
			
		||||
  iVector(const iVector<vtype,N> ©me)=default;
 | 
			
		||||
  iVector(iVector<vtype,N> &©me)=default;
 | 
			
		||||
  iVector<vtype,N> & operator= (const iVector<vtype,N> ©me) = default;
 | 
			
		||||
  iVector<vtype,N> & operator= (iVector<vtype,N> &©me) = default;
 | 
			
		||||
  */
 | 
			
		||||
 | 
			
		||||
    iVector<vtype,N> & operator= (const Zero &hero){
 | 
			
		||||
        zeroit(*this);
 | 
			
		||||
        return *this;
 | 
			
		||||
  iVector<vtype,N> & operator= (const Zero &hero){
 | 
			
		||||
    zeroit(*this);
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void zeroit(iVector<vtype,N> &that){
 | 
			
		||||
    for(int i=0;i<N;i++){
 | 
			
		||||
      zeroit(that._internal[i]);
 | 
			
		||||
    }
 | 
			
		||||
    friend strong_inline void zeroit(iVector<vtype,N> &that){
 | 
			
		||||
        for(int i=0;i<N;i++){
 | 
			
		||||
            zeroit(that._internal[i]);
 | 
			
		||||
        }
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void prefetch(iVector<vtype,N> &that){
 | 
			
		||||
    for(int i=0;i<N;i++) prefetch(that._internal[i]);
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
 | 
			
		||||
    for(int i=0;i<N;i++){
 | 
			
		||||
      vstream(out._internal[i],in._internal[i]);
 | 
			
		||||
    }
 | 
			
		||||
    friend strong_inline void prefetch(iVector<vtype,N> &that){
 | 
			
		||||
      for(int i=0;i<N;i++) prefetch(that._internal[i]);
 | 
			
		||||
  }
 | 
			
		||||
  friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
 | 
			
		||||
    for(int i=0;i<N;i++){
 | 
			
		||||
      permute(out._internal[i],in._internal[i],permutetype);
 | 
			
		||||
    }
 | 
			
		||||
    friend strong_inline void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
 | 
			
		||||
      for(int i=0;i<N;i++){
 | 
			
		||||
	vstream(out._internal[i],in._internal[i]);
 | 
			
		||||
      }
 | 
			
		||||
  }
 | 
			
		||||
  // Unary negation
 | 
			
		||||
  friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
 | 
			
		||||
    iVector<vtype,N> ret;
 | 
			
		||||
    for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
 | 
			
		||||
    return ret;
 | 
			
		||||
  }
 | 
			
		||||
  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
 | 
			
		||||
  strong_inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
 | 
			
		||||
    *this = (*this)*r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  strong_inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
 | 
			
		||||
    *this = (*this)-r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  strong_inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
 | 
			
		||||
    *this = (*this)+r;
 | 
			
		||||
    return *this;
 | 
			
		||||
  }
 | 
			
		||||
  strong_inline vtype & operator ()(int i) {
 | 
			
		||||
    return _internal[i];
 | 
			
		||||
  }
 | 
			
		||||
  strong_inline const vtype & operator ()(int i) const {
 | 
			
		||||
    return _internal[i];
 | 
			
		||||
  }
 | 
			
		||||
  friend std::ostream& operator<< (std::ostream& stream, const iVector<vtype,N> &o){
 | 
			
		||||
    stream<< "V<"<<N<<">{";
 | 
			
		||||
    for(int i=0;i<N;i++) {
 | 
			
		||||
      stream<<o._internal[i];
 | 
			
		||||
      if (i<N-1)	stream<<",";
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    friend strong_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
 | 
			
		||||
      for(int i=0;i<N;i++){
 | 
			
		||||
	permute(out._internal[i],in._internal[i],permutetype);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    // Unary negation
 | 
			
		||||
    friend strong_inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
 | 
			
		||||
        iVector<vtype,N> ret;
 | 
			
		||||
        for(int i=0;i<N;i++) ret._internal[i]= -r._internal[i];
 | 
			
		||||
        return ret;
 | 
			
		||||
    }
 | 
			
		||||
    // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
 | 
			
		||||
    strong_inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
 | 
			
		||||
        *this = (*this)*r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
    strong_inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
 | 
			
		||||
        *this = (*this)-r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
    strong_inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
 | 
			
		||||
        *this = (*this)+r;
 | 
			
		||||
        return *this;
 | 
			
		||||
    }
 | 
			
		||||
    strong_inline vtype & operator ()(int i) {
 | 
			
		||||
      return _internal[i];
 | 
			
		||||
    }
 | 
			
		||||
    strong_inline const vtype & operator ()(int i) const {
 | 
			
		||||
      return _internal[i];
 | 
			
		||||
    }
 | 
			
		||||
    friend std::ostream& operator<< (std::ostream& stream, const iVector<vtype,N> &o){
 | 
			
		||||
      stream<< "V<"<<N<<">{";
 | 
			
		||||
      for(int i=0;i<N;i++) {
 | 
			
		||||
	stream<<o._internal[i];
 | 
			
		||||
	if (i<N-1)	stream<<",";
 | 
			
		||||
      }
 | 
			
		||||
      stream<<"}";
 | 
			
		||||
      return stream;
 | 
			
		||||
    };
 | 
			
		||||
    //    strong_inline vtype && operator ()(int i) {
 | 
			
		||||
    //      return _internal[i];
 | 
			
		||||
    //    }
 | 
			
		||||
    stream<<"}";
 | 
			
		||||
    return stream;
 | 
			
		||||
  };
 | 
			
		||||
  //    strong_inline vtype && operator ()(int i) {
 | 
			
		||||
  //      return _internal[i];
 | 
			
		||||
  //    }
 | 
			
		||||
};
 | 
			
		||||
    
 | 
			
		||||
template<class vtype,int N> class iMatrix 
 | 
			
		||||
@@ -210,8 +213,6 @@ public:
 | 
			
		||||
  iMatrix(const Zero &z){ *this = zero; };
 | 
			
		||||
  iMatrix() =default;
 | 
			
		||||
  
 | 
			
		||||
  // No copy constructor...
 | 
			
		||||
  
 | 
			
		||||
  iMatrix& operator=(const iMatrix& rhs){
 | 
			
		||||
    for(int i=0;i<N;i++)
 | 
			
		||||
      for(int j=0;j<N;j++)
 | 
			
		||||
@@ -221,6 +222,17 @@ public:
 | 
			
		||||
  
 | 
			
		||||
 
 | 
			
		||||
 | 
			
		||||
  iMatrix(scalar_type s)  { (*this) = s ;};// recurse down and hit the constructor for vector_type
 | 
			
		||||
 | 
			
		||||
  /*
 | 
			
		||||
  iMatrix(const iMatrix<vtype,N> ©me)=default;
 | 
			
		||||
  iMatrix(iMatrix<vtype,N> &©me)=default;
 | 
			
		||||
  iMatrix<vtype,N> & operator= (const iMatrix<vtype,N> ©me) = default;
 | 
			
		||||
  iMatrix<vtype,N> & operator= (iMatrix<vtype,N> &©me) = default;
 | 
			
		||||
  */
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  iMatrix<vtype,N> & operator= (const Zero &hero){
 | 
			
		||||
    zeroit(*this);
 | 
			
		||||
    return *this;
 | 
			
		||||
 
 | 
			
		||||
@@ -10,9 +10,6 @@ namespace QCD {
 | 
			
		||||
    static const int Nhs=2; // half spinor
 | 
			
		||||
    static const int Nds=8; // double stored gauge field
 | 
			
		||||
 | 
			
		||||
    static const int CbRed  =0;
 | 
			
		||||
    static const int CbBlack=1;
 | 
			
		||||
    
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // QCD iMatrix types
 | 
			
		||||
    // Index conventions:                            Lorentz x Spin x Colour
 | 
			
		||||
 
 | 
			
		||||
@@ -4,8 +4,8 @@
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
const std::vector<int> WilsonMatrix::directions   ({0,1,2,3, 0, 1, 2, 3,0});
 | 
			
		||||
const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1,0});
 | 
			
		||||
const std::vector<int> WilsonMatrix::directions   ({0,1,2,3, 0, 1, 2, 3});
 | 
			
		||||
const std::vector<int> WilsonMatrix::displacements({1,1,1,1,-1,-1,-1,-1});
 | 
			
		||||
 | 
			
		||||
  // Should be in header?
 | 
			
		||||
const int WilsonMatrix::Xp = 0;
 | 
			
		||||
@@ -16,7 +16,6 @@ const int WilsonMatrix::Xm = 4;
 | 
			
		||||
const int WilsonMatrix::Ym = 5;
 | 
			
		||||
const int WilsonMatrix::Zm = 6;
 | 
			
		||||
const int WilsonMatrix::Tm = 7;
 | 
			
		||||
  //const int WilsonMatrix::X0 = 8;
 | 
			
		||||
 | 
			
		||||
  class WilsonCompressor {
 | 
			
		||||
  public:
 | 
			
		||||
@@ -72,20 +71,27 @@ const int WilsonMatrix::Tm = 7;
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,double _mass)
 | 
			
		||||
  : Stencil(Umu._grid,npoint,0,directions,displacements),
 | 
			
		||||
  WilsonMatrix::WilsonMatrix(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid, double _mass)  : 
 | 
			
		||||
    CheckerBoardedSparseMatrixBase<LatticeFermion>(&Fgrid,&Hgrid),
 | 
			
		||||
    Stencil    (  _grid,npoint,Even,directions,displacements),
 | 
			
		||||
    StencilEven(_cbgrid,npoint,Even,directions,displacements), // source is Even
 | 
			
		||||
    StencilOdd (_cbgrid,npoint,Odd ,directions,displacements), // source is Odd
 | 
			
		||||
    mass(_mass),
 | 
			
		||||
    Umu(_Umu._grid)
 | 
			
		||||
{
 | 
			
		||||
  // Allocate the required comms buffer
 | 
			
		||||
  grid = _Umu._grid;
 | 
			
		||||
  comm_buf.resize(Stencil._unified_buffer_size);
 | 
			
		||||
  DoubleStore(Umu,_Umu);
 | 
			
		||||
}
 | 
			
		||||
    Umu(_grid),
 | 
			
		||||
    UmuEven(_cbgrid),
 | 
			
		||||
    UmuOdd (_cbgrid)
 | 
			
		||||
  {
 | 
			
		||||
    // Allocate the required comms buffer
 | 
			
		||||
    comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
 | 
			
		||||
    
 | 
			
		||||
    DoubleStore(Umu,_Umu);
 | 
			
		||||
    pickCheckerboard(Even,UmuEven,Umu);
 | 
			
		||||
    pickCheckerboard(Odd ,UmuOdd,Umu);
 | 
			
		||||
  }
 | 
			
		||||
      
 | 
			
		||||
void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu)
 | 
			
		||||
{
 | 
			
		||||
  LatticeColourMatrix U(grid);
 | 
			
		||||
  LatticeColourMatrix U(_grid);
 | 
			
		||||
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++){
 | 
			
		||||
    U = peekIndex<LorentzIndex>(Umu,mu);
 | 
			
		||||
@@ -97,48 +103,63 @@ void WilsonMatrix::DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeF
 | 
			
		||||
 | 
			
		||||
RealD WilsonMatrix::M(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Dhop(in,out,0);
 | 
			
		||||
  out.checkerboard=in.checkerboard;
 | 
			
		||||
  Dhop(in,out,DaggerNo);
 | 
			
		||||
  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
 | 
			
		||||
  return norm2(out);
 | 
			
		||||
}
 | 
			
		||||
RealD WilsonMatrix::Mdag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Dhop(in,out,1);
 | 
			
		||||
  out.checkerboard=in.checkerboard;
 | 
			
		||||
  Dhop(in,out,DaggerYes);
 | 
			
		||||
  out = (4+mass)*in - 0.5*out  ; // FIXME : axpby_norm! fusion fun
 | 
			
		||||
  return norm2(out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::Meooe(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Dhop(in,out,0);
 | 
			
		||||
  out = 0.5*out; // FIXME : scale factor in Dhop
 | 
			
		||||
  if ( in.checkerboard == Odd ) {
 | 
			
		||||
    DhopEO(in,out,DaggerNo);
 | 
			
		||||
  } else {
 | 
			
		||||
    DhopOE(in,out,DaggerNo);
 | 
			
		||||
  }
 | 
			
		||||
  out = (-0.5)*out; // FIXME : scale factor in Dhop
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MeooeDag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  Dhop(in,out,1);
 | 
			
		||||
  if ( in.checkerboard == Odd ) {
 | 
			
		||||
    DhopEO(in,out,DaggerYes);
 | 
			
		||||
  } else {
 | 
			
		||||
    DhopOE(in,out,DaggerYes);
 | 
			
		||||
  }
 | 
			
		||||
  out = (-0.5)*out; // FIXME : scale factor in Dhop
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::Mooee(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  out = (4.0+mass)*in;
 | 
			
		||||
  return ;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out = (1.0/(4.0+mass))*in;
 | 
			
		||||
  return ;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MooeeDag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  Mooee(in,out);
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MooeeInv(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  out = (1.0/(4.0+mass))*in;
 | 
			
		||||
  return ;
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
  out = (1.0/(4.0+mass))*in;
 | 
			
		||||
  return ;
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
  MooeeInv(in,out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
void WilsonMatrix::DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
			    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
			    int ss,const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
    vHalfSpinColourVector  tmp;    
 | 
			
		||||
    vHalfSpinColourVector  chi;    
 | 
			
		||||
@@ -146,79 +167,78 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
 | 
			
		||||
    vHalfSpinColourVector Uchi;
 | 
			
		||||
    int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
    //    int ss = Stencil._LebesgueReorder[sss];
 | 
			
		||||
    int ssu= ss;
 | 
			
		||||
    //#define VERBOSE( A)  if ( ss<10 ) { std::cout << "site " <<ss << " " #A " neigh " << offset << " perm "<< perm <<std::endl;}    
 | 
			
		||||
 | 
			
		||||
    // Xp
 | 
			
		||||
    offset = Stencil._offsets [Xp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Xp][ss];
 | 
			
		||||
    offset = st._offsets [Xp][ss];
 | 
			
		||||
    local  = st._is_local[Xp][ss];
 | 
			
		||||
    perm   = st._permute[Xp][ss];
 | 
			
		||||
 | 
			
		||||
    ptype  = Stencil._permute_type[Xp];
 | 
			
		||||
    ptype  = st._permute_type[Xp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjXp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Xp),&chi());
 | 
			
		||||
    spReconXp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Yp
 | 
			
		||||
    offset = Stencil._offsets [Yp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Yp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Yp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Yp];
 | 
			
		||||
    offset = st._offsets [Yp][ss];
 | 
			
		||||
    local  = st._is_local[Yp][ss];
 | 
			
		||||
    perm   = st._permute[Yp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Yp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjYp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Yp),&chi());
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zp
 | 
			
		||||
    offset = Stencil._offsets [Zp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Zp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Zp];
 | 
			
		||||
    offset = st._offsets [Zp][ss];
 | 
			
		||||
    local  = st._is_local[Zp][ss];
 | 
			
		||||
    perm   = st._permute[Zp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Zp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjZp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Zp),&chi());
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tp
 | 
			
		||||
    offset = Stencil._offsets [Tp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Tp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Tp];
 | 
			
		||||
    offset = st._offsets [Tp][ss];
 | 
			
		||||
    local  = st._is_local[Tp][ss];
 | 
			
		||||
    perm   = st._permute[Tp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Tp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjTp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Tp),&chi());
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Xm
 | 
			
		||||
    offset = Stencil._offsets [Xm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Xm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Xm];
 | 
			
		||||
    offset = st._offsets [Xm][ss];
 | 
			
		||||
    local  = st._is_local[Xm][ss];
 | 
			
		||||
    perm   = st._permute[Xm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Xm];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
@@ -227,17 +247,17 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjXm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Xm),&chi());
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Ym
 | 
			
		||||
    offset = Stencil._offsets [Ym][ss];
 | 
			
		||||
    local  = Stencil._is_local[Ym][ss];
 | 
			
		||||
    perm   = Stencil._permute[Ym][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Ym];
 | 
			
		||||
    offset = st._offsets [Ym][ss];
 | 
			
		||||
    local  = st._is_local[Ym][ss];
 | 
			
		||||
    perm   = st._permute[Ym][ss];
 | 
			
		||||
    ptype  = st._permute_type[Ym];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[offset]);
 | 
			
		||||
@@ -245,46 +265,49 @@ void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjYm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Ym),&chi());
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zm
 | 
			
		||||
    offset = Stencil._offsets [Zm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Zm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Zm];
 | 
			
		||||
    offset = st._offsets [Zm][ss];
 | 
			
		||||
    local  = st._is_local[Zm][ss];
 | 
			
		||||
    perm   = st._permute[Zm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Zm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjZm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Zm),&chi());
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tm
 | 
			
		||||
    offset = Stencil._offsets [Tm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Tm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Tm];
 | 
			
		||||
    offset = st._offsets [Tm][ss];
 | 
			
		||||
    local  = st._is_local[Tm][ss];
 | 
			
		||||
    perm   = st._permute[Tm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Tm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjTm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Tm),&chi());
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    vstream(out._odata[ss],result);
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
			       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
			       int ss,const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
{
 | 
			
		||||
    vHalfSpinColourVector  tmp;    
 | 
			
		||||
    vHalfSpinColourVector  chi;    
 | 
			
		||||
@@ -292,78 +315,76 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
 | 
			
		||||
    vHalfSpinColourVector Uchi;
 | 
			
		||||
    int offset,local,perm, ptype;
 | 
			
		||||
 | 
			
		||||
    int ssu= ss;
 | 
			
		||||
 | 
			
		||||
    // Xp
 | 
			
		||||
    offset = Stencil._offsets [Xm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Xm][ss];
 | 
			
		||||
    offset = st._offsets [Xm][ss];
 | 
			
		||||
    local  = st._is_local[Xm][ss];
 | 
			
		||||
    perm   = st._permute[Xm][ss];
 | 
			
		||||
 | 
			
		||||
    ptype  = Stencil._permute_type[Xm];
 | 
			
		||||
    ptype  = st._permute_type[Xm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjXp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjXp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Xm),&chi());
 | 
			
		||||
    spReconXp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Yp
 | 
			
		||||
    offset = Stencil._offsets [Ym][ss];
 | 
			
		||||
    local  = Stencil._is_local[Ym][ss];
 | 
			
		||||
    perm   = Stencil._permute[Ym][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Ym];
 | 
			
		||||
    offset = st._offsets [Ym][ss];
 | 
			
		||||
    local  = st._is_local[Ym][ss];
 | 
			
		||||
    perm   = st._permute[Ym][ss];
 | 
			
		||||
    ptype  = st._permute_type[Ym];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjYp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Ym),&chi());
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zp
 | 
			
		||||
    offset = Stencil._offsets [Zm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Zm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Zm];
 | 
			
		||||
    offset = st._offsets [Zm][ss];
 | 
			
		||||
    local  = st._is_local[Zm][ss];
 | 
			
		||||
    perm   = st._permute[Zm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Zm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjZp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Zm),&chi());
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tp
 | 
			
		||||
    offset = Stencil._offsets [Tm][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tm][ss];
 | 
			
		||||
    perm   = Stencil._permute[Tm][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Tm];
 | 
			
		||||
    offset = st._offsets [Tm][ss];
 | 
			
		||||
    local  = st._is_local[Tm][ss];
 | 
			
		||||
    perm   = st._permute[Tm][ss];
 | 
			
		||||
    ptype  = st._permute_type[Tm];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTp(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjTp(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Tm),&chi());
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Xm
 | 
			
		||||
    offset = Stencil._offsets [Xp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Xp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Xp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Xp];
 | 
			
		||||
    offset = st._offsets [Xp][ss];
 | 
			
		||||
    local  = st._is_local[Xp][ss];
 | 
			
		||||
    perm   = st._permute[Xp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Xp];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) 
 | 
			
		||||
    {
 | 
			
		||||
@@ -372,17 +393,16 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjXm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Xp),&chi());
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    // Ym
 | 
			
		||||
    offset = Stencil._offsets [Yp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Yp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Yp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Yp];
 | 
			
		||||
    offset = st._offsets [Yp][ss];
 | 
			
		||||
    local  = st._is_local[Yp][ss];
 | 
			
		||||
    perm   = st._permute[Yp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Yp];
 | 
			
		||||
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjYm(tmp,in._odata[offset]);
 | 
			
		||||
@@ -390,69 +410,97 @@ void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjYm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Yp),&chi());
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zm
 | 
			
		||||
    offset = Stencil._offsets [Zp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Zp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Zp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Zp];
 | 
			
		||||
    offset = st._offsets [Zp][ss];
 | 
			
		||||
    local  = st._is_local[Zp][ss];
 | 
			
		||||
    perm   = st._permute[Zp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Zp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjZm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjZm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Zp),&chi());
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tm
 | 
			
		||||
    offset = Stencil._offsets [Tp][ss];
 | 
			
		||||
    local  = Stencil._is_local[Tp][ss];
 | 
			
		||||
    perm   = Stencil._permute[Tp][ss];
 | 
			
		||||
    ptype  = Stencil._permute_type[Tp];
 | 
			
		||||
    offset = st._offsets [Tp][ss];
 | 
			
		||||
    local  = st._is_local[Tp][ss];
 | 
			
		||||
    perm   = st._permute[Tp][ss];
 | 
			
		||||
    ptype  = st._permute_type[Tp];
 | 
			
		||||
    if ( local && perm ) {
 | 
			
		||||
      spProjTm(tmp,in._odata[offset]);
 | 
			
		||||
      permute(chi,tmp,ptype);
 | 
			
		||||
    } else if ( local ) {
 | 
			
		||||
      spProjTm(chi,in._odata[offset]);
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
      chi=buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
 | 
			
		||||
    mult(&Uchi(),&U._odata[ss](Tp),&chi());
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    vstream(out._odata[ss],result);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
			
		||||
void WilsonMatrix::DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField & U,
 | 
			
		||||
				const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  assert((dag==0) ||(dag==1));
 | 
			
		||||
 | 
			
		||||
  assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
			
		||||
  WilsonCompressor compressor(dag);
 | 
			
		||||
  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 | 
			
		||||
 | 
			
		||||
  if ( dag ) {
 | 
			
		||||
  st.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
 | 
			
		||||
 | 
			
		||||
  if ( dag == DaggerYes ) {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int sss=0;sss<grid->oSites();sss++){
 | 
			
		||||
      DhopSiteDag(sss,in,out);
 | 
			
		||||
    for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
      DhopSiteDag(st,U,comm_buf,sss,in,out);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
PARALLEL_FOR_LOOP
 | 
			
		||||
    for(int sss=0;sss<grid->oSites();sss++){
 | 
			
		||||
      DhopSite(sss,in,out);
 | 
			
		||||
    for(int sss=0;sss<in._grid->oSites();sss++){
 | 
			
		||||
      DhopSite(st,U,comm_buf,sss,in,out);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  conformable(in._grid,_cbgrid);    // verifies half grid
 | 
			
		||||
  conformable(in._grid,out._grid); // drops the cb check
 | 
			
		||||
 | 
			
		||||
  assert(in.checkerboard==Even);
 | 
			
		||||
  out.checkerboard = Odd;
 | 
			
		||||
 | 
			
		||||
  DhopInternal(StencilEven,UmuOdd,in,out,dag);
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  conformable(in._grid,_cbgrid);    // verifies half grid
 | 
			
		||||
  conformable(in._grid,out._grid); // drops the cb check
 | 
			
		||||
 | 
			
		||||
  assert(in.checkerboard==Odd);
 | 
			
		||||
  out.checkerboard = Even;
 | 
			
		||||
 | 
			
		||||
  DhopInternal(StencilOdd,UmuEven,in,out,dag);
 | 
			
		||||
}
 | 
			
		||||
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
 | 
			
		||||
{
 | 
			
		||||
  conformable(in._grid,_grid); // verifies full grid
 | 
			
		||||
  conformable(in._grid,out._grid);
 | 
			
		||||
 | 
			
		||||
  out.checkerboard = in.checkerboard;
 | 
			
		||||
 | 
			
		||||
  DhopInternal(Stencil,Umu,in,out,dag);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -11,14 +11,20 @@ namespace Grid {
 | 
			
		||||
      //NB r=1;
 | 
			
		||||
    public:
 | 
			
		||||
      double                        mass;
 | 
			
		||||
      GridBase                     *grid;
 | 
			
		||||
      //      GridBase                     *    grid; // Inherited
 | 
			
		||||
      //      GridBase                     *  cbgrid;
 | 
			
		||||
 | 
			
		||||
      // Copy of the gauge field 
 | 
			
		||||
      LatticeDoubledGaugeField             Umu;
 | 
			
		||||
      //Defines the stencils for even and odd
 | 
			
		||||
      CartesianStencil Stencil; 
 | 
			
		||||
      CartesianStencil StencilEven; 
 | 
			
		||||
      CartesianStencil StencilOdd; 
 | 
			
		||||
 | 
			
		||||
      //Defines the stencil
 | 
			
		||||
      CartesianStencil              Stencil; 
 | 
			
		||||
      static const int npoint=9;
 | 
			
		||||
      // Copy of the gauge field , with even and odd subsets
 | 
			
		||||
      LatticeDoubledGaugeField Umu;
 | 
			
		||||
      LatticeDoubledGaugeField UmuEven;
 | 
			
		||||
      LatticeDoubledGaugeField UmuOdd;
 | 
			
		||||
 | 
			
		||||
      static const int npoint=8;
 | 
			
		||||
      static const std::vector<int> directions   ;
 | 
			
		||||
      static const std::vector<int> displacements;
 | 
			
		||||
      static const int Xp,Xm,Yp,Ym,Zp,Zm,Tp,Tm;
 | 
			
		||||
@@ -27,7 +33,7 @@ namespace Grid {
 | 
			
		||||
      std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  comm_buf;
 | 
			
		||||
 | 
			
		||||
      // Constructor
 | 
			
		||||
      WilsonMatrix(LatticeGaugeField &Umu,double mass);
 | 
			
		||||
      WilsonMatrix(LatticeGaugeField &_Umu,GridCartesian &Fgrid,GridRedBlackCartesian &Hgrid,double _mass);
 | 
			
		||||
 | 
			
		||||
      // DoubleStore
 | 
			
		||||
      void DoubleStore(LatticeDoubledGaugeField &Uds,const LatticeGaugeField &Umu);
 | 
			
		||||
@@ -45,9 +51,19 @@ namespace Grid {
 | 
			
		||||
      virtual void   MooeeInvDag (const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
 | 
			
		||||
      // non-hermitian hopping term; half cb or both
 | 
			
		||||
      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
			
		||||
      void DhopSite   (int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
      void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
      void Dhop  (const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
			
		||||
      void DhopOE(const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
			
		||||
      void DhopEO(const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
			
		||||
      void DhopInternal(CartesianStencil & st,LatticeDoubledGaugeField &U,
 | 
			
		||||
			const LatticeFermion &in, LatticeFermion &out,int dag);
 | 
			
		||||
      // These ones will need to be package intelligently. WilsonType base class
 | 
			
		||||
      // for use by DWF etc..
 | 
			
		||||
      void DhopSite(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
		    std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
		    int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
      void DhopSiteDag(CartesianStencil &st,LatticeDoubledGaugeField &U,
 | 
			
		||||
		       std::vector<vHalfSpinColourVector,alignedAllocator<vHalfSpinColourVector> >  &buf,
 | 
			
		||||
		       int ss,const LatticeFermion &in, LatticeFermion &out);
 | 
			
		||||
 | 
			
		||||
      typedef iScalar<iMatrix<vComplex, Nc> > matrix;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -283,6 +283,7 @@ namespace Optimization {
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Here assign types 
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  typedef __m128 SIMD_Ftype;  // Single precision type
 | 
			
		||||
  typedef __m128d SIMD_Dtype; // Double precision type
 | 
			
		||||
  typedef __m128i SIMD_Itype; // Integer type
 | 
			
		||||
 
 | 
			
		||||
@@ -2,7 +2,7 @@
 | 
			
		||||
/*! @file Grid_vector_types.h
 | 
			
		||||
  @brief Defines templated class Grid_simd to deal with inner vector types
 | 
			
		||||
*/
 | 
			
		||||
// Time-stamp: <2015-05-22 17:08:19 neo>
 | 
			
		||||
// Time-stamp: <2015-05-26 12:05:39 neo>
 | 
			
		||||
//---------------------------------------------------------------------------
 | 
			
		||||
#ifndef GRID_VECTOR_TYPES
 | 
			
		||||
#define GRID_VECTOR_TYPES
 | 
			
		||||
@@ -21,31 +21,24 @@
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  // To take the floating point type of real/complex type
 | 
			
		||||
  template <typename T> 
 | 
			
		||||
    struct RealPart {
 | 
			
		||||
      typedef T type;
 | 
			
		||||
    };
 | 
			
		||||
  template <typename T> 
 | 
			
		||||
    struct RealPart< std::complex<T> >{
 | 
			
		||||
  template <typename T> struct RealPart {
 | 
			
		||||
    typedef T type;
 | 
			
		||||
  };
 | 
			
		||||
  template <typename T> struct RealPart< std::complex<T> >{
 | 
			
		||||
    typedef T type;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // type alias used to simplify the syntax of std::enable_if
 | 
			
		||||
  template <typename T> using Invoke =
 | 
			
		||||
    typename T::type;
 | 
			
		||||
  template <typename Condition, typename ReturnType> using EnableIf =
 | 
			
		||||
    Invoke<std::enable_if<Condition::value, ReturnType>>;
 | 
			
		||||
  template <typename Condition, typename ReturnType> using NotEnableIf =
 | 
			
		||||
    Invoke<std::enable_if<!Condition::value, ReturnType>>;
 | 
			
		||||
  template <typename T> using Invoke                                  =  typename T::type;
 | 
			
		||||
  template <typename Condition, typename ReturnType> using EnableIf   =    Invoke<std::enable_if<Condition::value, ReturnType>>;
 | 
			
		||||
  template <typename Condition, typename ReturnType> using NotEnableIf=    Invoke<std::enable_if<!Condition::value, ReturnType>>;
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  // Check for complexity with type traits
 | 
			
		||||
  template <typename T> 
 | 
			
		||||
    struct is_complex : std::false_type {};
 | 
			
		||||
  template < typename T > 
 | 
			
		||||
    struct is_complex< std::complex<T> >: std::true_type {};
 | 
			
		||||
  template <typename T>     struct is_complex : std::false_type {};
 | 
			
		||||
  template < typename T >   struct is_complex< std::complex<T> >: std::true_type {};
 | 
			
		||||
  ////////////////////////////////////////////////////////
 | 
			
		||||
  // Define the operation templates functors
 | 
			
		||||
  // general forms to allow for vsplat syntax
 | 
			
		||||
@@ -103,8 +96,6 @@ namespace Grid {
 | 
			
		||||
      vsplat(*this,Scalar_type(a));
 | 
			
		||||
    };
 | 
			
		||||
       
 | 
			
		||||
 | 
			
		||||
       
 | 
			
		||||
    ///////////////////////////////////////////////
 | 
			
		||||
    // mac, mult, sub, add, adj
 | 
			
		||||
    ///////////////////////////////////////////////
 | 
			
		||||
@@ -146,10 +137,6 @@ namespace Grid {
 | 
			
		||||
    template <  class S = Scalar_type, EnableIf<std::is_integral < S >, int> = 0 > 
 | 
			
		||||
      friend inline void vfalse(Grid_simd &ret){vsplat(ret,0);}
 | 
			
		||||
   
 | 
			
		||||
 | 
			
		||||
   
 | 
			
		||||
 
 | 
			
		||||
   
 | 
			
		||||
    ////////////////////////////////////
 | 
			
		||||
    // Arithmetic operator overloads +,-,*
 | 
			
		||||
    ////////////////////////////////////
 | 
			
		||||
@@ -185,7 +172,6 @@ namespace Grid {
 | 
			
		||||
	return ret;
 | 
			
		||||
      };
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // FIXME:  gonna remove these load/store, get, set, prefetch
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -345,20 +345,30 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
// REDUCE FIXME must be a cleaner implementation
 | 
			
		||||
       friend inline ComplexD Reduce(const vComplexD & in)
 | 
			
		||||
       { 
 | 
			
		||||
	 vComplexD v1,v2;
 | 
			
		||||
	 union { 
 | 
			
		||||
	   zvec v;
 | 
			
		||||
	   double f[sizeof(zvec)/sizeof(double)];
 | 
			
		||||
	 } conv;
 | 
			
		||||
	   
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	 return ComplexD(in.v[0],in.v[1]);
 | 
			
		||||
	 v1=in;
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
	 vComplexD v1;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired complex single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 return ComplexD(v1.v[0],v1.v[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
	 return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired complex single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); // avx 256; quad complex single
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
#endif 
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#error
 | 
			
		||||
#endif
 | 
			
		||||
	 conv.v = v1.v;
 | 
			
		||||
	 return ComplexD(conv.f[0],conv.f[1]);
 | 
			
		||||
        }
 | 
			
		||||
        
 | 
			
		||||
        // Unary negation
 | 
			
		||||
 
 | 
			
		||||
@@ -234,26 +234,34 @@ namespace Grid {
 | 
			
		||||
	}
 | 
			
		||||
	friend inline ComplexF Reduce(const vComplexF & in)
 | 
			
		||||
	{
 | 
			
		||||
	 vComplexF v1,v2;
 | 
			
		||||
	 union { 
 | 
			
		||||
	   cvec v;
 | 
			
		||||
	   float f[sizeof(cvec)/sizeof(float)];
 | 
			
		||||
	 } conv;
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	 vComplexF v1;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired complex single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 return ComplexF(v1.v[0],v1.v[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
	 vComplexF v1,v2;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired complex single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); // avx 256; quad complex single
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 return ComplexF(v1.v[0],v1.v[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
 | 
			
		||||
	 permute(v1,in,0); // avx512 octo-complex single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 permute(v2,v1,2); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#error
 | 
			
		||||
#endif
 | 
			
		||||
	 conv.v = v1.v;
 | 
			
		||||
	 return ComplexF(conv.f[0],conv.f[1]);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        friend inline vComplexF operator * (const ComplexF &a, vComplexF b){
 | 
			
		||||
 
 | 
			
		||||
@@ -210,25 +210,33 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
       friend inline RealD Reduce(const vRealD & in)
 | 
			
		||||
       {
 | 
			
		||||
	 vRealD v1,v2;
 | 
			
		||||
	 union { 
 | 
			
		||||
	   dvec v;
 | 
			
		||||
	   double f[sizeof(dvec)/sizeof(double)];
 | 
			
		||||
	 } conv;
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	 vRealD v1;
 | 
			
		||||
	 permute(v1,in,0); // sse 128; paired real double
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 return RealD(v1.v[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
	 vRealD v1,v2;
 | 
			
		||||
	 permute(v1,in,0); // avx 256; quad double
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 return v1.v[0];
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            return _mm512_reduce_add_pd(in.v);
 | 
			
		||||
	 permute(v1,in,0); // avx 512; octo-double
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 permute(v2,v1,2); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#endif
 | 
			
		||||
	 conv.v=v1.v;
 | 
			
		||||
	 return conv.f[0];
 | 
			
		||||
       }
 | 
			
		||||
 | 
			
		||||
        // *=,+=,-= operators
 | 
			
		||||
 
 | 
			
		||||
@@ -243,29 +243,39 @@ friend inline void vstore(const vRealF &ret, float *a){
 | 
			
		||||
        }
 | 
			
		||||
       friend inline RealF Reduce(const vRealF & in)
 | 
			
		||||
       {
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	 vRealF v1,v2;
 | 
			
		||||
	 union { 
 | 
			
		||||
	   fvec v;
 | 
			
		||||
	   float f[sizeof(fvec)/sizeof(double)];
 | 
			
		||||
	 } conv;
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	 permute(v1,in,0); // sse 128; quad single
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 return v1.v[0];
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(AVX1) || defined (AVX2)
 | 
			
		||||
	 vRealF v1,v2;
 | 
			
		||||
	 permute(v1,in,0); // avx 256; octo-double
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 permute(v2,v1,2); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 return v1.v[0];
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            return _mm512_reduce_add_ps(in.v);
 | 
			
		||||
	 permute(v1,in,0); // avx 256; octo-double
 | 
			
		||||
	 v1=v1+in;
 | 
			
		||||
	 permute(v2,v1,1); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 permute(v2,v1,2); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
	 permute(v2,v1,3); 
 | 
			
		||||
	 v1=v1+v2;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
#endif
 | 
			
		||||
	 conv.v=v1.v;
 | 
			
		||||
	 return conv.f[0];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // *=,+=,-= operators
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse"
 | 
			
		||||
DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp"
 | 
			
		||||
 | 
			
		||||
for D in $DIRS
 | 
			
		||||
do
 | 
			
		||||
 
 | 
			
		||||
@@ -25,6 +25,12 @@ g++-avx)
 | 
			
		||||
icpc-avx)
 | 
			
		||||
  CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
			
		||||
  ;;
 | 
			
		||||
icpc-avx-openmp-mpi)
 | 
			
		||||
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
;;
 | 
			
		||||
icpc-avx-openmp)
 | 
			
		||||
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
 | 
			
		||||
;;
 | 
			
		||||
icpc-avx2)
 | 
			
		||||
  CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
 | 
			
		||||
  ;;
 | 
			
		||||
 
 | 
			
		||||
@@ -1,8 +1,9 @@
 | 
			
		||||
#include "Grid.h"
 | 
			
		||||
 | 
			
		||||
//DEBUG
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
#include "simd/Grid_vector_types.h"
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 
 | 
			
		||||
@@ -13,7 +13,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  std::vector<int> simd_layout = GridDefaultSimd(4,vComplexF::Nsimd());
 | 
			
		||||
  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
  std::vector<int> latt_size  ({16,16,16,32});
 | 
			
		||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
  std::vector<int> clatt_size  ({4,4,4,8});
 | 
			
		||||
  int orthodir=3;
 | 
			
		||||
  int orthosz =latt_size[orthodir];
 | 
			
		||||
@@ -44,13 +44,15 @@ int main (int argc, char ** argv)
 | 
			
		||||
  // (1+2+3)=6 = N(N-1)/2 terms
 | 
			
		||||
  LatticeComplex Plaq(&Fine);
 | 
			
		||||
  LatticeComplex cPlaq(&Coarse);
 | 
			
		||||
 | 
			
		||||
  Plaq = zero;
 | 
			
		||||
#if 1
 | 
			
		||||
  for(int mu=1;mu<Nd;mu++){
 | 
			
		||||
    for(int nu=0;nu<mu;nu++){
 | 
			
		||||
      Plaq = Plaq + trace(U[mu]*Cshift(U[nu],mu,1)*adj(Cshift(U[mu],nu,1))*adj(U[nu]));
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
#endif
 | 
			
		||||
  double vol = Fine.gSites();
 | 
			
		||||
  Complex PlaqScale(1.0/vol/6.0/3.0);
 | 
			
		||||
  std::cout <<"PlaqScale" << PlaqScale<<std::endl;
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										0
									
								
								tests/InvSqrt.gnu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								tests/InvSqrt.gnu
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										2
									
								
								tests/Sqrt.gnu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								tests/Sqrt.gnu
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,2 @@
 | 
			
		||||
f(x) = 6.81384+(-2.34645e-06/(x+0.000228091))+(-1.51593e-05/(x+0.00112084))+(-6.89254e-05/(x+0.003496))+(-0.000288983/(x+0.00954309))+(-0.00119277/(x+0.024928))+(-0.0050183/(x+0.0646627))+(-0.0226449/(x+0.171576))+(-0.123767/(x+0.491792))+(-1.1705/(x+1.78667))+(-102.992/(x+18.4866));
 | 
			
		||||
f(x) = 0.14676+(0.00952992/(x+5.40933e-05))+(0.0115952/(x+0.000559699))+(0.0161824/(x+0.00203338))+(0.0243252/(x+0.00582831))+(0.0379533/(x+0.0154649))+(0.060699/(x+0.0401156))+(0.100345/(x+0.104788))+(0.178335/(x+0.286042))+(0.381586/(x+0.892189))+(1.42625/(x+4.38422));
 | 
			
		||||
		Reference in New Issue
	
	Block a user