mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Merge commit '899ca41cb8c8f47771bfd37cd895cbc2184e5560'
This commit is contained in:
		@@ -24,7 +24,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<int> latt4 = GridDefaultLatt();
 | 
					  std::vector<int> latt4 = GridDefaultLatt();
 | 
				
			||||||
  const int Ls=8;
 | 
					  const int Ls=16;
 | 
				
			||||||
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
@@ -82,22 +82,25 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
					  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
				
			||||||
  int ncall=10;
 | 
					  int ncall=100;
 | 
				
			||||||
  double t0=usecond();
 | 
					  {
 | 
				
			||||||
  for(int i=0;i<ncall;i++){
 | 
					    double t0=usecond();
 | 
				
			||||||
    Dw.Dhop(src,result,0);
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
  }
 | 
					      Dw.Dhop(src,result,0);
 | 
				
			||||||
  double t1=usecond();
 | 
					    }
 | 
				
			||||||
 | 
					    double t1=usecond();
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					    double flops=1344*volume*ncall;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
					    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
 | 
				
			||||||
  double flops=1344*volume*ncall;
 | 
					    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
				
			||||||
  
 | 
					    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
 | 
					    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
 | 
					    err = ref-result; 
 | 
				
			||||||
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
 | 
					    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
					    Dw.Report();
 | 
				
			||||||
  err = ref-result; 
 | 
					  }
 | 
				
			||||||
  std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if (1)
 | 
					  if (1)
 | 
				
			||||||
@@ -140,6 +143,18 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
 | 
					  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
 | 
					  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    double t0=usecond();
 | 
				
			||||||
 | 
					    for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					      Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    double t1=usecond();
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					    double flops=(1344.0*volume*ncall)/2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
					  Dw.DhopEO(src_o,r_e,DaggerNo);
 | 
				
			||||||
  Dw.DhopOE(src_e,r_o,DaggerNo);
 | 
					  Dw.DhopOE(src_e,r_o,DaggerNo);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -8,7 +8,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
  Grid_init(&argc,&argv);
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  const int Nvec=4;
 | 
					  const int Nvec=8;
 | 
				
			||||||
  typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
 | 
					  typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
 | 
				
			||||||
  typedef iVector<vReal,Nvec> Vec;
 | 
					  typedef iVector<vReal,Nvec> Vec;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -28,7 +28,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  const int lmax = 16536*16;
 | 
					  const int lmax = 16536*16;
 | 
				
			||||||
  for(int lat=4;lat<=lmax;lat*=2){
 | 
					  for(int lat=4;lat<=lmax;lat*=2){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int Nloop=lmax*128*4/lat;
 | 
					    int Nloop=lmax*4/lat;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::vector<int> latt_size  ({2*mpi_layout[0],2*mpi_layout[1],4*mpi_layout[2],lat*mpi_layout[3]});
 | 
					    std::vector<int> latt_size  ({2*mpi_layout[0],2*mpi_layout[1],4*mpi_layout[2],lat*mpi_layout[3]});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -45,7 +45,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    std::vector<LatticeVec> x(threads,&Grid);
 | 
					    std::vector<LatticeVec> x(threads,&Grid);
 | 
				
			||||||
    for(int t=0;t<threads;t++){
 | 
					    for(int t=0;t<threads;t++){
 | 
				
			||||||
      random(pRNG,x[t]);
 | 
					      //      random(pRNG,x[t]);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    double start=usecond();
 | 
					    double start=usecond();
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -28,7 +28,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
				
			||||||
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
				
			||||||
  uint64_t lmax=44;
 | 
					  uint64_t lmax=44;
 | 
				
			||||||
#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
 | 
					#define NLOOP (1*lmax*lmax*lmax*lmax/vol)
 | 
				
			||||||
  for(int lat=4;lat<=lmax;lat+=4){
 | 
					  for(int lat=4;lat<=lmax;lat+=4){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
					      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
				
			||||||
@@ -37,11 +37,11 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      uint64_t Nloop=NLOOP;
 | 
					      uint64_t Nloop=NLOOP;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
					      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      LatticeVec z(&Grid); random(pRNG,z);
 | 
					      LatticeVec z(&Grid); //random(pRNG,z);
 | 
				
			||||||
      LatticeVec x(&Grid); random(pRNG,x);
 | 
					      LatticeVec x(&Grid); //random(pRNG,x);
 | 
				
			||||||
      LatticeVec y(&Grid); random(pRNG,y);
 | 
					      LatticeVec y(&Grid); //random(pRNG,y);
 | 
				
			||||||
      double a=2.0;
 | 
					      double a=2.0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -72,11 +72,11 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
					      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
				
			||||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
					      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
					      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      LatticeVec z(&Grid); random(pRNG,z);
 | 
					      LatticeVec z(&Grid); //random(pRNG,z);
 | 
				
			||||||
      LatticeVec x(&Grid); random(pRNG,x);
 | 
					      LatticeVec x(&Grid); //random(pRNG,x);
 | 
				
			||||||
      LatticeVec y(&Grid); random(pRNG,y);
 | 
					      LatticeVec y(&Grid); //random(pRNG,y);
 | 
				
			||||||
      double a=2.0;
 | 
					      double a=2.0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      uint64_t Nloop=NLOOP;
 | 
					      uint64_t Nloop=NLOOP;
 | 
				
			||||||
@@ -110,11 +110,11 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
					      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
					      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      LatticeVec z(&Grid); random(pRNG,z);
 | 
					      LatticeVec z(&Grid); //random(pRNG,z);
 | 
				
			||||||
      LatticeVec x(&Grid); random(pRNG,x);
 | 
					      LatticeVec x(&Grid); //random(pRNG,x);
 | 
				
			||||||
      LatticeVec y(&Grid); random(pRNG,y);
 | 
					      LatticeVec y(&Grid); //random(pRNG,y);
 | 
				
			||||||
      RealD a=2.0;
 | 
					      RealD a=2.0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -145,10 +145,10 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
      uint64_t Nloop=NLOOP;
 | 
					      uint64_t Nloop=NLOOP;
 | 
				
			||||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
					      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
					      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedRandomDevice();
 | 
				
			||||||
      LatticeVec z(&Grid); random(pRNG,z);
 | 
					      LatticeVec z(&Grid); //random(pRNG,z);
 | 
				
			||||||
      LatticeVec x(&Grid); random(pRNG,x);
 | 
					      LatticeVec x(&Grid); //random(pRNG,x);
 | 
				
			||||||
      LatticeVec y(&Grid); random(pRNG,y);
 | 
					      LatticeVec y(&Grid); //random(pRNG,y);
 | 
				
			||||||
      RealD a=2.0;
 | 
					      RealD a=2.0;
 | 
				
			||||||
      Real nn;      
 | 
					      Real nn;      
 | 
				
			||||||
      double start=usecond();
 | 
					      double start=usecond();
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -90,7 +90,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
 | 
					  WilsonFermionR Dw(Umu,Grid,RBGrid,mass);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
					  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
				
			||||||
  int ncall=10000;
 | 
					  int ncall=1000;
 | 
				
			||||||
  double t0=usecond();
 | 
					  double t0=usecond();
 | 
				
			||||||
  for(int i=0;i<ncall;i++){
 | 
					  for(int i=0;i<ncall;i++){
 | 
				
			||||||
    Dw.Dhop(src,result,0);
 | 
					    Dw.Dhop(src,result,0);
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										12
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								configure.ac
									
									
									
									
									
								
							@@ -38,6 +38,7 @@ AC_CHECK_HEADERS(mm_malloc.h)
 | 
				
			|||||||
AC_CHECK_HEADERS(malloc/malloc.h)
 | 
					AC_CHECK_HEADERS(malloc/malloc.h)
 | 
				
			||||||
AC_CHECK_HEADERS(malloc.h)
 | 
					AC_CHECK_HEADERS(malloc.h)
 | 
				
			||||||
AC_CHECK_HEADERS(endian.h)
 | 
					AC_CHECK_HEADERS(endian.h)
 | 
				
			||||||
 | 
					AC_CHECK_HEADERS(execinfo.h)
 | 
				
			||||||
AC_CHECK_HEADERS(gmp.h)
 | 
					AC_CHECK_HEADERS(gmp.h)
 | 
				
			||||||
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 | 
					AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 | 
				
			||||||
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
 | 
					AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
 | 
				
			||||||
@@ -65,7 +66,7 @@ AC_CHECK_FUNCS([gettimeofday])
 | 
				
			|||||||
#Please install or provide the correct path to your installation
 | 
					#Please install or provide the correct path to your installation
 | 
				
			||||||
#Info at: http://www.mpfr.org/)])
 | 
					#Info at: http://www.mpfr.org/)])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|IMCI],\
 | 
					AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
 | 
				
			||||||
	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
 | 
						[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
 | 
				
			||||||
	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
 | 
						[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -90,6 +91,15 @@ case ${ac_SIMD} in
 | 
				
			|||||||
       	AC_MSG_WARN([Your processor does not support AVX instructions])
 | 
					       	AC_MSG_WARN([Your processor does not support AVX instructions])
 | 
				
			||||||
       fi
 | 
					       fi
 | 
				
			||||||
     ;;
 | 
					     ;;
 | 
				
			||||||
 | 
					     AVXFMA4)
 | 
				
			||||||
 | 
					       echo Configuring for AVX
 | 
				
			||||||
 | 
					       AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] )
 | 
				
			||||||
 | 
					       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
 | 
				
			||||||
 | 
					       supported=yes			  
 | 
				
			||||||
 | 
					       else
 | 
				
			||||||
 | 
					       	AC_MSG_WARN([Your processor does not support AVX instructions])
 | 
				
			||||||
 | 
					       fi
 | 
				
			||||||
 | 
					     ;;
 | 
				
			||||||
     AVX2)
 | 
					     AVX2)
 | 
				
			||||||
       echo Configuring for AVX2
 | 
					       echo Configuring for AVX2
 | 
				
			||||||
       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
 | 
					       AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] )
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -18,8 +18,8 @@
 | 
				
			|||||||
#include <algorithms/iterative/ConjugateGradientMultiShift.h>
 | 
					#include <algorithms/iterative/ConjugateGradientMultiShift.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// Lanczos support
 | 
					// Lanczos support
 | 
				
			||||||
//#include <algorithms/iterative/MatrixUtils.h>
 | 
					#include <algorithms/iterative/MatrixUtils.h>
 | 
				
			||||||
//#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
 | 
					#include <algorithms/iterative/ImplicitlyRestartedLanczos.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <algorithms/CoarsenedMatrix.h>
 | 
					#include <algorithms/CoarsenedMatrix.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -9,6 +9,9 @@
 | 
				
			|||||||
/* AVX512 Intrinsics for Knights Landing */
 | 
					/* AVX512 Intrinsics for Knights Landing */
 | 
				
			||||||
#undef AVX512
 | 
					#undef AVX512
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* AVX Intrinsics with FMA4 */
 | 
				
			||||||
 | 
					#undef AVXFMA4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* EMPTY_SIMD only for DEBUGGING */
 | 
					/* EMPTY_SIMD only for DEBUGGING */
 | 
				
			||||||
#undef EMPTY_SIMD
 | 
					#undef EMPTY_SIMD
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										33
									
								
								lib/Init.cc
									
									
									
									
									
								
							
							
						
						
									
										33
									
								
								lib/Init.cc
									
									
									
									
									
								
							@@ -1,7 +1,6 @@
 | 
				
			|||||||
/****************************************************************************/
 | 
					/****************************************************************************/
 | 
				
			||||||
/* pab: Signal magic. Processor state dump is x86-64 specific               */
 | 
					/* pab: Signal magic. Processor state dump is x86-64 specific               */
 | 
				
			||||||
/****************************************************************************/
 | 
					/****************************************************************************/
 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <stdlib.h>
 | 
					#include <stdlib.h>
 | 
				
			||||||
#include <stdio.h>
 | 
					#include <stdio.h>
 | 
				
			||||||
#include <stdint.h>
 | 
					#include <stdint.h>
 | 
				
			||||||
@@ -16,10 +15,10 @@
 | 
				
			|||||||
#include <algorithm>
 | 
					#include <algorithm>
 | 
				
			||||||
#include <iterator>
 | 
					#include <iterator>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#undef __X86_64
 | 
					#define __X86_64
 | 
				
			||||||
#define MAC
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef MAC
 | 
					
 | 
				
			||||||
 | 
					#ifdef HAVE_EXECINFO_H
 | 
				
			||||||
#include <execinfo.h>
 | 
					#include <execinfo.h>
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -31,8 +30,10 @@ namespace Grid {
 | 
				
			|||||||
//////////////////////////////////////////////////////
 | 
					//////////////////////////////////////////////////////
 | 
				
			||||||
static std::vector<int> Grid_default_latt;
 | 
					static std::vector<int> Grid_default_latt;
 | 
				
			||||||
static std::vector<int> Grid_default_mpi;
 | 
					static std::vector<int> Grid_default_mpi;
 | 
				
			||||||
int GridThread::_threads;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int GridThread::_threads =1;
 | 
				
			||||||
 | 
					int GridThread::_hyperthreads=1;
 | 
				
			||||||
 | 
					int GridThread::_cores=1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
 | 
					const std::vector<int> &GridDefaultLatt(void)     {return Grid_default_latt;};
 | 
				
			||||||
const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
 | 
					const std::vector<int> &GridDefaultMpi(void)      {return Grid_default_mpi;};
 | 
				
			||||||
@@ -118,13 +119,19 @@ void GridParseLayout(char **argv,int argc,
 | 
				
			|||||||
    arg= GridCmdOptionPayload(argv,argv+argc,"--grid");
 | 
					    arg= GridCmdOptionPayload(argv,argv+argc,"--grid");
 | 
				
			||||||
    GridCmdOptionIntVector(arg,latt);
 | 
					    GridCmdOptionIntVector(arg,latt);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  if( GridCmdOptionExists(argv,argv+argc,"--omp") ){
 | 
					  if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
 | 
				
			||||||
    std::vector<int> ompthreads(0);
 | 
					    std::vector<int> ompthreads(0);
 | 
				
			||||||
    arg= GridCmdOptionPayload(argv,argv+argc,"--omp");
 | 
					    arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
 | 
				
			||||||
    GridCmdOptionIntVector(arg,ompthreads);
 | 
					    GridCmdOptionIntVector(arg,ompthreads);
 | 
				
			||||||
    assert(ompthreads.size()==1);
 | 
					    assert(ompthreads.size()==1);
 | 
				
			||||||
    GridThread::SetThreads(ompthreads[0]);
 | 
					    GridThread::SetThreads(ompthreads[0]);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
 | 
				
			||||||
 | 
					    std::vector<int> cores(0);
 | 
				
			||||||
 | 
					    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
 | 
				
			||||||
 | 
					    GridCmdOptionIntVector(arg,cores);
 | 
				
			||||||
 | 
					    GridThread::SetCores(cores[0]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -183,6 +190,11 @@ void Grid_init(int *argc,char ***argv)
 | 
				
			|||||||
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
 | 
					  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
 | 
				
			||||||
    LebesgueOrder::UseLebesgueOrder=1;
 | 
					    LebesgueOrder::UseLebesgueOrder=1;
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
 | 
				
			||||||
 | 
					    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
 | 
				
			||||||
 | 
					    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
  GridParseLayout(*argv,*argc,
 | 
					  GridParseLayout(*argv,*argc,
 | 
				
			||||||
		  Grid_default_latt,
 | 
							  Grid_default_latt,
 | 
				
			||||||
		  Grid_default_mpi);
 | 
							  Grid_default_mpi);
 | 
				
			||||||
@@ -222,11 +234,14 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
				
			|||||||
  printf("  mem address %llx\n",(unsigned long long)si->si_addr);
 | 
					  printf("  mem address %llx\n",(unsigned long long)si->si_addr);
 | 
				
			||||||
  printf("         code %d\n",si->si_code);
 | 
					  printf("         code %d\n",si->si_code);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef __X86_64
 | 
					  // Linux/Posix
 | 
				
			||||||
 | 
					#ifdef __linux__ 
 | 
				
			||||||
 | 
					  // And x86 64bit
 | 
				
			||||||
    ucontext_t * uc= (ucontext_t *)ptr;
 | 
					    ucontext_t * uc= (ucontext_t *)ptr;
 | 
				
			||||||
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
 | 
					  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
 | 
				
			||||||
  printf("  instruction %llx\n",(unsigned long long)sc->rip);
 | 
					  printf("  instruction %llx\n",(unsigned long long)sc->rip);
 | 
				
			||||||
#define REG(A)  printf("  %s %lx\n",#A,sc-> A);
 | 
					#define REG(A)  printf("  %s %lx\n",#A,sc-> A);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  REG(rdi);
 | 
					  REG(rdi);
 | 
				
			||||||
  REG(rsi);
 | 
					  REG(rsi);
 | 
				
			||||||
  REG(rbp);
 | 
					  REG(rbp);
 | 
				
			||||||
@@ -247,7 +262,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 | 
				
			|||||||
  REG(r14);
 | 
					  REG(r14);
 | 
				
			||||||
  REG(r15);
 | 
					  REG(r15);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#ifdef MAC
 | 
					#ifdef HAVE_EXECINFO_H
 | 
				
			||||||
  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);
 | 
					  int symbols    = backtrace        (Grid_backtrace_buffer,_NBACKTRACE);
 | 
				
			||||||
  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
 | 
					  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
 | 
				
			||||||
  for (int i = 0; i < symbols; i++){
 | 
					  for (int i = 0; i < symbols; i++){
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										28
									
								
								lib/PerfCount.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								lib/PerfCount.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,28 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#include <PerfCount.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
 | 
				
			||||||
 | 
					const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 | 
				
			||||||
 | 
					  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." },
 | 
				
			||||||
 | 
					  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." },
 | 
				
			||||||
 | 
					  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES    ,  "CACHE_REFERENCES..." },
 | 
				
			||||||
 | 
					  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES        ,  "CACHE_MISSES......." },
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS)       ,  "L1D_READ_MISS......"},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS)     ,  "L1D_READ_ACCESS...."},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS)      ,  "L1D_WRITE_MISS....."},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS)    ,  "L1D_WRITE_ACCESS..."},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS)   ,  "L1D_PREFETCH_MISS.."},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) ,  "L1D_PREFETCH_ACCESS"},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS)        ,  "LL_READ_MISS......."},
 | 
				
			||||||
 | 
					  //  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS)      ,  "LL_READ_ACCESS....."},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS)       ,  "LL_WRITE_MISS......"},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS)     ,  "LL_WRITE_ACCESS...."},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS)    ,  "LL_PREFETCH_MISS..."},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS)  ,  "LL_PREFETCH_ACCESS."},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS)       ,  "L1I_READ_MISS......"},
 | 
				
			||||||
 | 
					  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS)     ,  "L1I_READ_ACCESS...."}
 | 
				
			||||||
 | 
					  //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										133
									
								
								lib/PerfCount.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										133
									
								
								lib/PerfCount.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,133 @@
 | 
				
			|||||||
 | 
					#ifndef GRID_PERFCOUNT_H
 | 
				
			||||||
 | 
					#define GRID_PERFCOUNT_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <sys/time.h>
 | 
				
			||||||
 | 
					#include <ctime>
 | 
				
			||||||
 | 
					#include <chrono>
 | 
				
			||||||
 | 
					#include <string.h>
 | 
				
			||||||
 | 
					#include <sys/ioctl.h>
 | 
				
			||||||
 | 
					#include <syscall.h>
 | 
				
			||||||
 | 
					#include <linux/perf_event.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 | 
				
			||||||
 | 
								    int cpu, int group_fd, unsigned long flags)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int ret;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ret = syscall(__NR_perf_event_open, hw_event, pid, cpu,
 | 
				
			||||||
 | 
							group_fd, flags);
 | 
				
			||||||
 | 
					  return ret;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class PerformanceCounter {
 | 
				
			||||||
 | 
					private:
 | 
				
			||||||
 | 
					  typedef struct { 
 | 
				
			||||||
 | 
					  public:
 | 
				
			||||||
 | 
					    uint32_t type;
 | 
				
			||||||
 | 
					    uint64_t config;
 | 
				
			||||||
 | 
					    const char *name;
 | 
				
			||||||
 | 
					  } PerformanceCounterConfig; 
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  static const PerformanceCounterConfig PerformanceCounterConfigs [];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  enum PerformanceCounterType {
 | 
				
			||||||
 | 
					    CPUCYCLES=0,
 | 
				
			||||||
 | 
					    INSTRUCTIONS,
 | 
				
			||||||
 | 
					    //    STALL_CYCLES,
 | 
				
			||||||
 | 
					    CACHE_REFERENCES,
 | 
				
			||||||
 | 
					    CACHE_MISSES,
 | 
				
			||||||
 | 
					    L1D_READ_MISS,
 | 
				
			||||||
 | 
					    L1D_READ_ACCESS,
 | 
				
			||||||
 | 
					    L1D_WRITE_MISS,
 | 
				
			||||||
 | 
					    L1D_WRITE_ACCESS,
 | 
				
			||||||
 | 
					    L1D_PREFETCH_MISS,
 | 
				
			||||||
 | 
					    L1D_PREFETCH_ACCESS,
 | 
				
			||||||
 | 
					    LL_READ_MISS,
 | 
				
			||||||
 | 
					    //    LL_READ_ACCESS,
 | 
				
			||||||
 | 
					    LL_WRITE_MISS,
 | 
				
			||||||
 | 
					    LL_WRITE_ACCESS,
 | 
				
			||||||
 | 
					    LL_PREFETCH_MISS,
 | 
				
			||||||
 | 
					    LL_PREFETCH_ACCESS,
 | 
				
			||||||
 | 
					    L1I_READ_MISS,
 | 
				
			||||||
 | 
					    L1I_READ_ACCESS,
 | 
				
			||||||
 | 
					    PERFORMANCE_COUNTER_NUM_TYPES
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					public:
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					  int PCT;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  struct perf_event_attr pe;
 | 
				
			||||||
 | 
					  long long count;
 | 
				
			||||||
 | 
					  int fd;
 | 
				
			||||||
 | 
					  uint64_t elapsed;
 | 
				
			||||||
 | 
					  uint64_t begin;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  static int NumTypes(void){ 
 | 
				
			||||||
 | 
					    return PERFORMANCE_COUNTER_NUM_TYPES;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  PerformanceCounter(int _pct) {
 | 
				
			||||||
 | 
					    assert(_pct>=0);
 | 
				
			||||||
 | 
					    assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
 | 
				
			||||||
 | 
					    fd=-1;
 | 
				
			||||||
 | 
					    count=0;
 | 
				
			||||||
 | 
					    PCT =_pct;
 | 
				
			||||||
 | 
					    Open();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  void Open(void) 
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    memset(&pe, 0, sizeof(struct perf_event_attr));
 | 
				
			||||||
 | 
					    pe.size = sizeof(struct perf_event_attr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    pe.disabled = 1;
 | 
				
			||||||
 | 
					    pe.exclude_kernel = 1;
 | 
				
			||||||
 | 
					    pe.exclude_hv = 1;
 | 
				
			||||||
 | 
					    pe.inherit    = 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    pe.type  = PerformanceCounterConfigs[PCT].type;
 | 
				
			||||||
 | 
					    pe.config= PerformanceCounterConfigs[PCT].config;
 | 
				
			||||||
 | 
					    const char * name = PerformanceCounterConfigs[PCT].name;
 | 
				
			||||||
 | 
					    fd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
 | 
				
			||||||
 | 
					    if (fd == -1) {
 | 
				
			||||||
 | 
					      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
 | 
				
			||||||
 | 
					      perror("Error is");
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  void Start(void)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    if ( fd!= -1) {
 | 
				
			||||||
 | 
					      ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 | 
				
			||||||
 | 
					      ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    begin  =__rdtsc();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  void Stop(void) {
 | 
				
			||||||
 | 
					    count=0;
 | 
				
			||||||
 | 
					    if ( fd!= -1) {
 | 
				
			||||||
 | 
					      ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 | 
				
			||||||
 | 
					      ::read(fd, &count, sizeof(long long));
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    elapsed = __rdtsc() - begin;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  void Report(void) {
 | 
				
			||||||
 | 
					    printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ~PerformanceCounter()
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    close(fd);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										405
									
								
								lib/Stencil.h
									
									
									
									
									
								
							
							
						
						
									
										405
									
								
								lib/Stencil.h
									
									
									
									
									
								
							@@ -48,10 +48,14 @@ namespace Grid {
 | 
				
			|||||||
    int _around_the_world;
 | 
					    int _around_the_world;
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  template<class vobj,class cobj, class compressor>
 | 
				
			||||||
  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 | 
					  class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 | 
				
			||||||
  public:
 | 
					  public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      typedef uint32_t StencilInteger;
 | 
					      typedef uint32_t StencilInteger;
 | 
				
			||||||
 | 
					      typedef typename cobj::vector_type vector_type;
 | 
				
			||||||
 | 
					      typedef typename cobj::scalar_type scalar_type;
 | 
				
			||||||
 | 
					      typedef typename cobj::scalar_object scalar_object;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int                               _checkerboard;
 | 
					      int                               _checkerboard;
 | 
				
			||||||
      int                               _npoints; // Move to template param?
 | 
					      int                               _npoints; // Move to template param?
 | 
				
			||||||
@@ -66,33 +70,334 @@ namespace Grid {
 | 
				
			|||||||
      // npoints x Osites() of these
 | 
					      // npoints x Osites() of these
 | 
				
			||||||
      std::vector<std::vector<StencilEntry> > _entries;
 | 
					      std::vector<std::vector<StencilEntry> > _entries;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Comms buffers
 | 
				
			||||||
 | 
					      std::vector<std::vector<scalar_object> > send_buf_extract;
 | 
				
			||||||
 | 
					      std::vector<std::vector<scalar_object> > recv_buf_extract;
 | 
				
			||||||
 | 
					      std::vector<scalar_object *> pointers;
 | 
				
			||||||
 | 
					      std::vector<scalar_object *> rpointers;
 | 
				
			||||||
 | 
					      Vector<cobj> send_buf;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
 | 
					      inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int _unified_buffer_size;
 | 
					      int _unified_buffer_size;
 | 
				
			||||||
      int _request_count;
 | 
					      int _request_count;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      double buftime;
 | 
				
			||||||
 | 
					      double gathertime;
 | 
				
			||||||
 | 
					      double commtime;
 | 
				
			||||||
 | 
					      double commstime;
 | 
				
			||||||
 | 
					      double halotime;
 | 
				
			||||||
 | 
					      double scattertime;
 | 
				
			||||||
 | 
					      double mergetime;
 | 
				
			||||||
 | 
					      double gathermtime;
 | 
				
			||||||
 | 
					      double splicetime;
 | 
				
			||||||
 | 
					      double nosplicetime;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      CartesianStencil(GridBase *grid,
 | 
					
 | 
				
			||||||
		       int npoints,
 | 
					
 | 
				
			||||||
		       int checkerboard,
 | 
					
 | 
				
			||||||
		       const std::vector<int> &directions,
 | 
					  CartesianStencil(GridBase *grid,
 | 
				
			||||||
		       const std::vector<int> &distances);
 | 
									     int npoints,
 | 
				
			||||||
 | 
									     int checkerboard,
 | 
				
			||||||
 | 
									     const std::vector<int> &directions,
 | 
				
			||||||
 | 
									     const std::vector<int> &distances) 
 | 
				
			||||||
 | 
					    :   _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      gathertime=0;
 | 
				
			||||||
 | 
					      commtime=0;
 | 
				
			||||||
 | 
					      commstime=0;
 | 
				
			||||||
 | 
					      halotime=0;
 | 
				
			||||||
 | 
					      scattertime=0;
 | 
				
			||||||
 | 
					      mergetime=0;
 | 
				
			||||||
 | 
					      gathermtime=0;
 | 
				
			||||||
 | 
					      buftime=0;
 | 
				
			||||||
 | 
					      splicetime=0;
 | 
				
			||||||
 | 
					      nosplicetime=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      _npoints = npoints;
 | 
				
			||||||
 | 
					      _grid    = grid;
 | 
				
			||||||
 | 
					      _directions = directions;
 | 
				
			||||||
 | 
					      _distances  = distances;
 | 
				
			||||||
 | 
					      _unified_buffer_size=0;
 | 
				
			||||||
 | 
					      _request_count =0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      int osites  = _grid->oSites();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      for(int i=0;i<npoints;i++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int point = i;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						_entries[i].resize( osites);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int dimension    = directions[i];
 | 
				
			||||||
 | 
						int displacement = distances[i];
 | 
				
			||||||
 | 
						int shift = displacement;
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						int fd = _grid->_fdimensions[dimension];
 | 
				
			||||||
 | 
						int rd = _grid->_rdimensions[dimension];
 | 
				
			||||||
 | 
						_permute_type[point]=_grid->PermuteType(dimension);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						_checkerboard = checkerboard;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						// the permute type
 | 
				
			||||||
 | 
						int simd_layout     = _grid->_simd_layout[dimension];
 | 
				
			||||||
 | 
						int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
				
			||||||
 | 
						int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int sshift[2];
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						// Underlying approach. For each local site build
 | 
				
			||||||
 | 
						// up a table containing the npoint "neighbours" and whether they 
 | 
				
			||||||
 | 
						// live in lattice or a comms buffer.
 | 
				
			||||||
 | 
						if ( !comm_dim ) {
 | 
				
			||||||
 | 
						  sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
 | 
				
			||||||
 | 
						  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  if ( sshift[0] == sshift[1] ) {
 | 
				
			||||||
 | 
						    Local(point,dimension,shift,0x3);
 | 
				
			||||||
 | 
						  } else {
 | 
				
			||||||
 | 
						    Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
				
			||||||
 | 
						    Local(point,dimension,shift,0x2);// both with block stride loop iteration
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						} else { // All permute extract done in comms phase prior to Stencil application
 | 
				
			||||||
 | 
						  //        So tables are the same whether comm_dim or splice_dim
 | 
				
			||||||
 | 
						  sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
 | 
				
			||||||
 | 
						  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 | 
				
			||||||
 | 
						  if ( sshift[0] == sshift[1] ) {
 | 
				
			||||||
 | 
						    Comms(point,dimension,shift,0x3);
 | 
				
			||||||
 | 
						    //	    std::cout<<"Comms 0x3"<<std::endl;
 | 
				
			||||||
 | 
						  } else {
 | 
				
			||||||
 | 
						    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
				
			||||||
 | 
						    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
 | 
				
			||||||
 | 
						    //	    std::cout<<"Comms 0x1 ; 0x2"<<std::endl;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						//	for(int ss=0;ss<osites;ss++){
 | 
				
			||||||
 | 
						  //	  std::cout << "point["<<i<<"] "<<ss<<"-> o"<<_entries[i][ss]._offset<<"; l"<<
 | 
				
			||||||
 | 
						  //	    _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<<std::endl;
 | 
				
			||||||
 | 
						//	}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    void Local     (int point, int dimension,int shiftpm,int cbmask)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      int fd = _grid->_fdimensions[dimension];
 | 
				
			||||||
 | 
					      int rd = _grid->_rdimensions[dimension];
 | 
				
			||||||
 | 
					      int ld = _grid->_ldimensions[dimension];
 | 
				
			||||||
 | 
					      int gd = _grid->_gdimensions[dimension];
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      // Map to always positive shift modulo global full dimension.
 | 
				
			||||||
 | 
					      int shift = (shiftpm+fd)%fd;
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      // the permute type
 | 
				
			||||||
 | 
					      int permute_dim =_grid->PermuteDim(dimension);
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      for(int x=0;x<rd;x++){       
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						int o   = 0;
 | 
				
			||||||
 | 
						int bo  = x * _grid->_ostride[dimension];
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						int cb= (cbmask==0x2)? Odd : Even;
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
 | 
				
			||||||
 | 
						int sx     = (x+sshift)%rd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int wraparound=0;
 | 
				
			||||||
 | 
						if ( (shiftpm==-1) && (sx>x)  ) {
 | 
				
			||||||
 | 
						  wraparound = 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						if ( (shiftpm== 1) && (sx<x)  ) {
 | 
				
			||||||
 | 
						  wraparound = 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						int permute_slice=0;
 | 
				
			||||||
 | 
						if(permute_dim){
 | 
				
			||||||
 | 
						  int wrap = sshift/rd;
 | 
				
			||||||
 | 
						  int  num = sshift%rd;
 | 
				
			||||||
 | 
						  if ( x< rd-num ) permute_slice=wrap;
 | 
				
			||||||
 | 
						  else permute_slice = 1-wrap;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  	CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    void Comms     (int point,int dimension,int shiftpm,int cbmask)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      GridBase *grid=_grid;
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      int fd              = _grid->_fdimensions[dimension];
 | 
				
			||||||
 | 
					      int ld              = _grid->_ldimensions[dimension];
 | 
				
			||||||
 | 
					      int rd              = _grid->_rdimensions[dimension];
 | 
				
			||||||
 | 
					      int pd              = _grid->_processors[dimension];
 | 
				
			||||||
 | 
					      int simd_layout     = _grid->_simd_layout[dimension];
 | 
				
			||||||
 | 
					      int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      //      assert(simd_layout==1); // Why?
 | 
				
			||||||
 | 
					      assert(comm_dim==1);
 | 
				
			||||||
 | 
					      int shift = (shiftpm + fd) %fd;
 | 
				
			||||||
 | 
					      assert(shift>=0);
 | 
				
			||||||
 | 
					      assert(shift<fd);
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					      _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
 | 
				
			||||||
 | 
					                                           // send to one or more remote nodes.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      int cb= (cbmask==0x2)? Odd : Even;
 | 
				
			||||||
 | 
					      int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      for(int x=0;x<rd;x++){       
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int sx        =  (x+sshift)%rd;
 | 
				
			||||||
 | 
						int comm_proc = ((x+sshift)/rd)%pd;
 | 
				
			||||||
 | 
					    	int offnode = (comm_proc!= 0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						//	std::cout << "Stencil shift "<<shift<<" sshift "<<sshift<<" fd "<<fd<<" rd " <<rd<<" offnode "<<offnode<<" sx "<<sx<<std::endl;
 | 
				
			||||||
 | 
						int wraparound=0;
 | 
				
			||||||
 | 
						if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
 | 
				
			||||||
 | 
						  wraparound = 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
 | 
				
			||||||
 | 
						  wraparound = 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						if (!offnode) {
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  int permute_slice=0;
 | 
				
			||||||
 | 
						  CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); 
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						} else {
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  int words = buffer_size;
 | 
				
			||||||
 | 
						  if (cbmask != 0x3) words=words>>1;
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  //	  GatherPlaneSimple (point,dimension,sx,cbmask);
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  int rank           = grid->_processor;
 | 
				
			||||||
 | 
						  int recv_from_rank;
 | 
				
			||||||
 | 
						  int xmit_to_rank;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  int unified_buffer_offset = _unified_buffer_size;
 | 
				
			||||||
 | 
						  _unified_buffer_size    += words;
 | 
				
			||||||
 | 
						  ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  // Routine builds up integer table for each site in _offsets, _is_local, _permute
 | 
				
			||||||
 | 
					  void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      int rd = _grid->_rdimensions[dimension];
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      if ( !_grid->CheckerBoarded(dimension) ) {
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						int o   = 0;                                     // relative offset to base within plane
 | 
				
			||||||
 | 
						int ro  = rplane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
				
			||||||
 | 
						int lo  = lplane*_grid->_ostride[dimension]; // offset in buffer
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						// Simple block stride gather of SIMD objects
 | 
				
			||||||
 | 
						for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 | 
				
			||||||
 | 
						  for(int b=0;b<_grid->_slice_block[dimension];b++){
 | 
				
			||||||
 | 
						    _entries[point][lo+o+b]._offset  =ro+o+b;
 | 
				
			||||||
 | 
						    _entries[point][lo+o+b]._is_local=1;
 | 
				
			||||||
 | 
						    _entries[point][lo+o+b]._permute=permute;
 | 
				
			||||||
 | 
						    _entries[point][lo+o+b]._around_the_world=wrap;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						  o +=_grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						int ro  = rplane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
				
			||||||
 | 
						int lo  = lplane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
				
			||||||
 | 
						int o   = 0;                                     // relative offset to base within plane
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 | 
				
			||||||
 | 
						  for(int b=0;b<_grid->_slice_block[dimension];b++){
 | 
				
			||||||
 | 
						    
 | 
				
			||||||
 | 
						    int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
 | 
				
			||||||
 | 
						    
 | 
				
			||||||
 | 
						    if ( ocb&cbmask ) {
 | 
				
			||||||
 | 
						      _entries[point][lo+o+b]._offset =ro+o+b;
 | 
				
			||||||
 | 
						      _entries[point][lo+o+b]._is_local=1;
 | 
				
			||||||
 | 
						      _entries[point][lo+o+b]._permute=permute;
 | 
				
			||||||
 | 
						      _entries[point][lo+o+b]._around_the_world=wrap;
 | 
				
			||||||
 | 
						    }
 | 
				
			||||||
 | 
						    
 | 
				
			||||||
 | 
						    }
 | 
				
			||||||
 | 
						  o +=_grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  // Routine builds up integer table for each site in _offsets, _is_local, _permute
 | 
				
			||||||
 | 
					   void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      int rd = _grid->_rdimensions[dimension];
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      if ( !_grid->CheckerBoarded(dimension) ) {
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
				
			||||||
 | 
						int o   = 0;                                    // relative offset to base within plane
 | 
				
			||||||
 | 
						int bo  = 0;                                    // offset in buffer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						// Simple block stride gather of SIMD objects
 | 
				
			||||||
 | 
						for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 | 
				
			||||||
 | 
						  for(int b=0;b<_grid->_slice_block[dimension];b++){
 | 
				
			||||||
 | 
						    _entries[point][so+o+b]._offset  =offset+(bo++);
 | 
				
			||||||
 | 
						    _entries[point][so+o+b]._is_local=0;
 | 
				
			||||||
 | 
						    _entries[point][so+o+b]._permute=0;
 | 
				
			||||||
 | 
						    _entries[point][so+o+b]._around_the_world=wrap;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						  o +=_grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      } else { 
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
				
			||||||
 | 
						int o   = 0;                                      // relative offset to base within plane
 | 
				
			||||||
 | 
						int bo  = 0;                                      // offset in buffer
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 | 
				
			||||||
 | 
						  for(int b=0;b<_grid->_slice_block[dimension];b++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						    int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
				
			||||||
 | 
						    if ( ocb & cbmask ) {
 | 
				
			||||||
 | 
						      _entries[point][so+o+b]._offset  =offset+(bo++);
 | 
				
			||||||
 | 
						      _entries[point][so+o+b]._is_local=0;
 | 
				
			||||||
 | 
						      _entries[point][so+o+b]._permute =0;
 | 
				
			||||||
 | 
						      _entries[point][so+o+b]._around_the_world=wrap;
 | 
				
			||||||
 | 
						    }
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						  o +=_grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//      CartesianStencil(GridBase *grid,
 | 
				
			||||||
 | 
					//		       int npoints,
 | 
				
			||||||
 | 
					//		       int checkerboard,
 | 
				
			||||||
 | 
					//		       const std::vector<int> &directions,
 | 
				
			||||||
 | 
					//		       const std::vector<int> &distances);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // Add to tables for various cases;  is this mistaken. only local if 1 proc in dim
 | 
					      // Add to tables for various cases;  is this mistaken. only local if 1 proc in dim
 | 
				
			||||||
      // Can this be avoided with simpler coding of comms?
 | 
					      // Can this be avoided with simpler coding of comms?
 | 
				
			||||||
      void Local     (int point, int dimension,int shift,int cbmask);
 | 
					   //      void Local     (int point, int dimension,int shift,int cbmask);
 | 
				
			||||||
      void Comms     (int point, int dimension,int shift,int cbmask);
 | 
					   //      void Comms     (int point, int dimension,int shift,int cbmask);
 | 
				
			||||||
      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
 | 
					   //      void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap);
 | 
				
			||||||
      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
 | 
					   //      void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // Could allow a functional munging of the halo to another type during the comms.
 | 
					      // Could allow a functional munging of the halo to another type during the comms.
 | 
				
			||||||
      // this could implement the 16bit/32bit/64bit compression.
 | 
					      // this could implement the 16bit/32bit/64bit compression.
 | 
				
			||||||
      template<class vobj,class cobj, class compressor> void 
 | 
					      void HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
 | 
				
			||||||
	HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
 | 
					 | 
				
			||||||
      {
 | 
					      {
 | 
				
			||||||
	// conformable(source._grid,_grid);
 | 
						// conformable(source._grid,_grid);
 | 
				
			||||||
	assert(source._grid==_grid);
 | 
						assert(source._grid==_grid);
 | 
				
			||||||
 | 
						halotime-=usecond();
 | 
				
			||||||
	if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
 | 
						if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
 | 
				
			||||||
	int u_comm_offset=0;
 | 
						int u_comm_offset=0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -126,24 +431,33 @@ namespace Grid {
 | 
				
			|||||||
	    sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 | 
						    sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 | 
				
			||||||
	    if ( sshift[0] == sshift[1] ) {
 | 
						    if ( sshift[0] == sshift[1] ) {
 | 
				
			||||||
	      if (splice_dim) {
 | 
						      if (splice_dim) {
 | 
				
			||||||
 | 
							splicetime-=usecond();
 | 
				
			||||||
		GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
 | 
							GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
 | 
				
			||||||
 | 
							splicetime+=usecond();
 | 
				
			||||||
	      } else { 
 | 
						      } else { 
 | 
				
			||||||
 | 
							nosplicetime-=usecond();
 | 
				
			||||||
		GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
 | 
							GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress);
 | 
				
			||||||
 | 
							nosplicetime+=usecond();
 | 
				
			||||||
	      }
 | 
						      }
 | 
				
			||||||
	    } else {
 | 
						    } else {
 | 
				
			||||||
 | 
						      std::cout << "dim "<<dimension<<"cb "<<_checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 | 
				
			||||||
	      if(splice_dim){
 | 
						      if(splice_dim){
 | 
				
			||||||
 | 
							splicetime-=usecond();
 | 
				
			||||||
		GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
 | 
							GatherStartCommsSimd(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);// if checkerboard is unfavourable take two passes
 | 
				
			||||||
		GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
 | 
							GatherStartCommsSimd(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);// both with block stride loop iteration
 | 
				
			||||||
 | 
							splicetime+=usecond();
 | 
				
			||||||
	      } else {
 | 
						      } else {
 | 
				
			||||||
 | 
							nosplicetime-=usecond();
 | 
				
			||||||
		GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
 | 
							GatherStartComms(source,dimension,shift,0x1,u_comm_buf,u_comm_offset,compress);
 | 
				
			||||||
		GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
 | 
							GatherStartComms(source,dimension,shift,0x2,u_comm_buf,u_comm_offset,compress);
 | 
				
			||||||
 | 
							nosplicetime+=usecond();
 | 
				
			||||||
	      }
 | 
						      }
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						halotime+=usecond();
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      template<class vobj,class cobj, class compressor> 
 | 
					 | 
				
			||||||
        void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
					        void GatherStartComms(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
				
			||||||
			      std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 | 
								      std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 | 
				
			||||||
			      int &u_comm_offset,compressor & compress)
 | 
								      int &u_comm_offset,compressor & compress)
 | 
				
			||||||
@@ -164,28 +478,29 @@ namespace Grid {
 | 
				
			|||||||
	  assert(comm_dim==1);
 | 
						  assert(comm_dim==1);
 | 
				
			||||||
	  assert(shift>=0);
 | 
						  assert(shift>=0);
 | 
				
			||||||
	  assert(shift<fd);
 | 
						  assert(shift<fd);
 | 
				
			||||||
	  
 | 
					
 | 
				
			||||||
	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
						  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
				
			||||||
	  
 | 
					
 | 
				
			||||||
	  std::vector<cobj,alignedAllocator<cobj> > send_buf(buffer_size); // hmm...
 | 
						  if(send_buf.size()<buffer_size) send_buf.resize(buffer_size);
 | 
				
			||||||
	  std::vector<cobj,alignedAllocator<cobj> > recv_buf(buffer_size);
 | 
					
 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	  int cb= (cbmask==0x2)? Odd : Even;
 | 
						  int cb= (cbmask==0x2)? Odd : Even;
 | 
				
			||||||
	  int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
						  int sshift= _grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
 | 
				
			||||||
	  
 | 
					
 | 
				
			||||||
	  for(int x=0;x<rd;x++){       
 | 
						  for(int x=0;x<rd;x++){       
 | 
				
			||||||
	    
 | 
					
 | 
				
			||||||
	    int sx        = (x+sshift)%rd;
 | 
						    int sx        = (x+sshift)%rd;
 | 
				
			||||||
	    int comm_proc = ((x+sshift)/rd)%pd;
 | 
						    int comm_proc = ((x+sshift)/rd)%pd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	    if (comm_proc) {
 | 
						    if (comm_proc) {
 | 
				
			||||||
	      
 | 
						      
 | 
				
			||||||
	      int words = send_buf.size();
 | 
						      int words = buffer_size;
 | 
				
			||||||
	      if (cbmask != 0x3) words=words>>1;
 | 
						      if (cbmask != 0x3) words=words>>1;
 | 
				
			||||||
	    
 | 
						    
 | 
				
			||||||
	      int bytes = words * sizeof(cobj);
 | 
						      int bytes = words * sizeof(cobj);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						      gathertime-=usecond();
 | 
				
			||||||
	      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
 | 
						      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress);
 | 
				
			||||||
 | 
						      gathertime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      int rank           = _grid->_processor;
 | 
						      int rank           = _grid->_processor;
 | 
				
			||||||
	      int recv_from_rank;
 | 
						      int recv_from_rank;
 | 
				
			||||||
@@ -195,31 +510,27 @@ namespace Grid {
 | 
				
			|||||||
	      assert (recv_from_rank != _grid->ThisRank());
 | 
						      assert (recv_from_rank != _grid->ThisRank());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      //      FIXME Implement asynchronous send & also avoid buffer copy
 | 
						      //      FIXME Implement asynchronous send & also avoid buffer copy
 | 
				
			||||||
 | 
						      commtime-=usecond();
 | 
				
			||||||
	      _grid->SendToRecvFrom((void *)&send_buf[0],
 | 
						      _grid->SendToRecvFrom((void *)&send_buf[0],
 | 
				
			||||||
				   xmit_to_rank,
 | 
									   xmit_to_rank,
 | 
				
			||||||
				   (void *)&recv_buf[0],
 | 
									    (void *)&u_comm_buf[u_comm_offset],
 | 
				
			||||||
				   recv_from_rank,
 | 
									   recv_from_rank,
 | 
				
			||||||
				   bytes);
 | 
									   bytes);
 | 
				
			||||||
 | 
						      commtime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      for(int i=0;i<buffer_size;i++){
 | 
						      u_comm_offset+=words;
 | 
				
			||||||
		u_comm_buf[u_comm_offset+i]=recv_buf[i];
 | 
					 | 
				
			||||||
	      }
 | 
					 | 
				
			||||||
	      u_comm_offset+=buffer_size;
 | 
					 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      template<class vobj,class cobj, class compressor> 
 | 
					 | 
				
			||||||
	void  GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
						void  GatherStartCommsSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,
 | 
				
			||||||
				   std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 | 
									   std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,
 | 
				
			||||||
				   int &u_comm_offset,compressor &compress)
 | 
									   int &u_comm_offset,compressor &compress)
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
 | 
						  buftime-=usecond();
 | 
				
			||||||
	  const int Nsimd = _grid->Nsimd();
 | 
						  const int Nsimd = _grid->Nsimd();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  typedef typename cobj::vector_type vector_type;
 | 
					 | 
				
			||||||
	  typedef typename cobj::scalar_type scalar_type;
 | 
					 | 
				
			||||||
	  typedef typename cobj::scalar_object scalar_object;
 | 
					 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  int fd = _grid->_fdimensions[dimension];
 | 
						  int fd = _grid->_fdimensions[dimension];
 | 
				
			||||||
	  int rd = _grid->_rdimensions[dimension];
 | 
						  int rd = _grid->_rdimensions[dimension];
 | 
				
			||||||
@@ -241,17 +552,23 @@ namespace Grid {
 | 
				
			|||||||
	  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
						  int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
				
			||||||
	  int words = sizeof(cobj)/sizeof(vector_type);
 | 
						  int words = sizeof(cobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  /*
 | 
						  assert(cbmask==0x3); // Fixme think there is a latent bug if not true
 | 
				
			||||||
	   * possibly slow to allocate
 | 
					 | 
				
			||||||
	   * Doesn't matter in this test, but may want to preallocate in the 
 | 
					 | 
				
			||||||
	   * dirac operators
 | 
					 | 
				
			||||||
	   */
 | 
					 | 
				
			||||||
	  std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) ); 
 | 
					 | 
				
			||||||
	  std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
 | 
					 | 
				
			||||||
	  int bytes = buffer_size*sizeof(scalar_object);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  std::vector<scalar_object *> pointers(Nsimd);  //
 | 
						  //	Should grow to max size and then cost very little thereafter
 | 
				
			||||||
	  std::vector<scalar_object *> rpointers(Nsimd); // received pointers
 | 
						  send_buf_extract.resize(Nsimd);
 | 
				
			||||||
 | 
						  recv_buf_extract.resize(Nsimd);
 | 
				
			||||||
 | 
						  for(int l=0;l<Nsimd;l++){
 | 
				
			||||||
 | 
						    if( send_buf_extract[l].size() < buffer_size) {
 | 
				
			||||||
 | 
						      send_buf_extract[l].resize(buffer_size);
 | 
				
			||||||
 | 
						      recv_buf_extract[l].resize(buffer_size);
 | 
				
			||||||
 | 
						    }
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						  pointers.resize(Nsimd);
 | 
				
			||||||
 | 
						  rpointers.resize(Nsimd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  int bytes = buffer_size*sizeof(scalar_object);
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  buftime+=usecond();
 | 
				
			||||||
	  
 | 
						  
 | 
				
			||||||
	  ///////////////////////////////////////////
 | 
						  ///////////////////////////////////////////
 | 
				
			||||||
	  // Work out what to send where
 | 
						  // Work out what to send where
 | 
				
			||||||
@@ -272,7 +589,9 @@ namespace Grid {
 | 
				
			|||||||
	      }
 | 
						      }
 | 
				
			||||||
	      int sx   = (x+sshift)%rd;
 | 
						      int sx   = (x+sshift)%rd;
 | 
				
			||||||
	      
 | 
						      
 | 
				
			||||||
 | 
						      gathermtime-=usecond();
 | 
				
			||||||
	      Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
 | 
						      Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
 | 
				
			||||||
 | 
						      gathermtime+=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      for(int i=0;i<Nsimd;i++){
 | 
						      for(int i=0;i<Nsimd;i++){
 | 
				
			||||||
		
 | 
							
 | 
				
			||||||
@@ -299,11 +618,13 @@ namespace Grid {
 | 
				
			|||||||
		  
 | 
							  
 | 
				
			||||||
		  _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 | 
							  _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
 | 
				
			||||||
		  
 | 
							  
 | 
				
			||||||
 | 
							  commstime-=usecond();
 | 
				
			||||||
		  _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
 | 
							  _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
 | 
				
			||||||
					xmit_to_rank,
 | 
										xmit_to_rank,
 | 
				
			||||||
					(void *)&recv_buf_extract[i][0],
 | 
										(void *)&recv_buf_extract[i][0],
 | 
				
			||||||
					recv_from_rank,
 | 
										recv_from_rank,
 | 
				
			||||||
					bytes);
 | 
										bytes);
 | 
				
			||||||
 | 
							  commstime+=usecond();
 | 
				
			||||||
		  
 | 
							  
 | 
				
			||||||
		  rpointers[i] = &recv_buf_extract[i][0];
 | 
							  rpointers[i] = &recv_buf_extract[i][0];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -313,11 +634,13 @@ namespace Grid {
 | 
				
			|||||||
	      }
 | 
						      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	      // Here we don't want to scatter, just place into a buffer.
 | 
						      // Here we don't want to scatter, just place into a buffer.
 | 
				
			||||||
 | 
						      mergetime-=usecond();
 | 
				
			||||||
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
	      for(int i=0;i<buffer_size;i++){
 | 
						      for(int i=0;i<buffer_size;i++){
 | 
				
			||||||
		assert(u_comm_offset+i<_unified_buffer_size);
 | 
							//		assert(u_comm_offset+i<_unified_buffer_size);
 | 
				
			||||||
		merge(u_comm_buf[u_comm_offset+i],rpointers,i);
 | 
							merge(u_comm_buf[u_comm_offset+i],rpointers,i);
 | 
				
			||||||
	      }
 | 
						      }
 | 
				
			||||||
 | 
						      mergetime+=usecond();
 | 
				
			||||||
	      u_comm_offset+=buffer_size;
 | 
						      u_comm_offset+=buffer_size;
 | 
				
			||||||
	    }
 | 
						    }
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -24,7 +24,16 @@ namespace Grid {
 | 
				
			|||||||
class GridThread {
 | 
					class GridThread {
 | 
				
			||||||
 public:
 | 
					 public:
 | 
				
			||||||
  static int _threads;
 | 
					  static int _threads;
 | 
				
			||||||
 | 
					  static int _hyperthreads;
 | 
				
			||||||
 | 
					  static int _cores;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  static void SetCores(int cr) { 
 | 
				
			||||||
 | 
					#ifdef GRID_OMP
 | 
				
			||||||
 | 
					    _cores = cr;
 | 
				
			||||||
 | 
					#else 
 | 
				
			||||||
 | 
					    _cores = 1;
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
  static void SetThreads(int thr) { 
 | 
					  static void SetThreads(int thr) { 
 | 
				
			||||||
#ifdef GRID_OMP
 | 
					#ifdef GRID_OMP
 | 
				
			||||||
    _threads = MIN(thr,omp_get_max_threads()) ;
 | 
					    _threads = MIN(thr,omp_get_max_threads()) ;
 | 
				
			||||||
@@ -35,22 +44,28 @@ class GridThread {
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
  static void SetMaxThreads(void) { 
 | 
					  static void SetMaxThreads(void) { 
 | 
				
			||||||
#ifdef GRID_OMP
 | 
					#ifdef GRID_OMP
 | 
				
			||||||
 | 
					    setenv("KMP_AFFINITY","balanced",1);
 | 
				
			||||||
    _threads = omp_get_max_threads();
 | 
					    _threads = omp_get_max_threads();
 | 
				
			||||||
    omp_set_num_threads(_threads);
 | 
					    omp_set_num_threads(_threads);
 | 
				
			||||||
#else 
 | 
					#else 
 | 
				
			||||||
    _threads = 1;
 | 
					    _threads = 1;
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					  static int GetHyperThreads(void) { assert(_threads%_cores ==0); return _threads/_cores; };
 | 
				
			||||||
 | 
					  static int GetCores(void)   { return _cores; };
 | 
				
			||||||
  static int GetThreads(void) { return _threads; };
 | 
					  static int GetThreads(void) { return _threads; };
 | 
				
			||||||
  static int SumArraySize(void) {return _threads;};
 | 
					  static int SumArraySize(void) {return _threads;};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  static void GetWork(int nwork, int me, int & mywork, int & myoff){
 | 
					  static void GetWork(int nwork, int me, int & mywork, int & myoff){
 | 
				
			||||||
    int basework = nwork/_threads;
 | 
					    GetWork(nwork,me,mywork,myoff,_threads);
 | 
				
			||||||
    int backfill = _threads-(nwork%_threads);
 | 
					  }
 | 
				
			||||||
    if ( me >= _threads ) { 
 | 
					  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
 | 
				
			||||||
 | 
					    int basework = nwork/units;
 | 
				
			||||||
 | 
					    int backfill = units-(nwork%units);
 | 
				
			||||||
 | 
					    if ( me >= units ) { 
 | 
				
			||||||
      mywork = myoff = 0;
 | 
					      mywork = myoff = 0;
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
      mywork = (nwork+me)/_threads;
 | 
					      mywork = (nwork+me)/units;
 | 
				
			||||||
      myoff  = basework * me;
 | 
					      myoff  = basework * me;
 | 
				
			||||||
      if ( me > backfill ) 
 | 
					      if ( me > backfill ) 
 | 
				
			||||||
	myoff+= (me-backfill);
 | 
						myoff+= (me-backfill);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -170,7 +170,7 @@ namespace Grid {
 | 
				
			|||||||
    ////////////////////
 | 
					    ////////////////////
 | 
				
			||||||
    Geometry         geom;
 | 
					    Geometry         geom;
 | 
				
			||||||
    GridBase *       _grid; 
 | 
					    GridBase *       _grid; 
 | 
				
			||||||
    CartesianStencil Stencil; 
 | 
					    CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    std::vector<CoarseMatrix> A;
 | 
					    std::vector<CoarseMatrix> A;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -9,23 +9,34 @@ namespace Grid {
 | 
				
			|||||||
  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
  // Simple general polynomial with user supplied coefficients
 | 
					  // Simple general polynomial with user supplied coefficients
 | 
				
			||||||
  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
					  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					  template<class Field>
 | 
				
			||||||
 | 
					  class HermOpOperatorFunction : public OperatorFunction<Field> {
 | 
				
			||||||
 | 
					    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 | 
				
			||||||
 | 
					      Linop.HermOp(in,out);
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template<class Field>
 | 
					  template<class Field>
 | 
				
			||||||
  class Polynomial : public OperatorFunction<Field> {
 | 
					  class Polynomial : public OperatorFunction<Field> {
 | 
				
			||||||
  private:
 | 
					  private:
 | 
				
			||||||
    std::vector<double> Coeffs;
 | 
					    std::vector<RealD> Coeffs;
 | 
				
			||||||
  public:
 | 
					  public:
 | 
				
			||||||
    Polynomial(std::vector<double> &_Coeffs) : Coeffs(_Coeffs) {};
 | 
					    Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Implement the required interface
 | 
					    // Implement the required interface
 | 
				
			||||||
    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 | 
					    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      Field AtoN = in;
 | 
					      Field AtoN(in._grid);
 | 
				
			||||||
 | 
					      Field Mtmp(in._grid);
 | 
				
			||||||
 | 
					      AtoN = in;
 | 
				
			||||||
      out = AtoN*Coeffs[0];
 | 
					      out = AtoN*Coeffs[0];
 | 
				
			||||||
 | 
					      //      std::cout <<"Poly in " <<norm2(in)<<std::endl;
 | 
				
			||||||
 | 
					      //      std::cout <<"0 " <<norm2(out)<<std::endl;
 | 
				
			||||||
      for(int n=1;n<Coeffs.size();n++){
 | 
					      for(int n=1;n<Coeffs.size();n++){
 | 
				
			||||||
	Field Mtmp=AtoN;
 | 
						Mtmp = AtoN;
 | 
				
			||||||
	Linop.Op(Mtmp,AtoN);
 | 
						Linop.HermOp(Mtmp,AtoN);
 | 
				
			||||||
	out=out+AtoN*Coeffs[n];
 | 
						out=out+AtoN*Coeffs[n];
 | 
				
			||||||
 | 
						//	std::cout << n<<" " <<norm2(out)<<std::endl;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
@@ -36,15 +47,15 @@ namespace Grid {
 | 
				
			|||||||
  template<class Field>
 | 
					  template<class Field>
 | 
				
			||||||
  class Chebyshev : public OperatorFunction<Field> {
 | 
					  class Chebyshev : public OperatorFunction<Field> {
 | 
				
			||||||
  private:
 | 
					  private:
 | 
				
			||||||
    std::vector<double> Coeffs;
 | 
					    std::vector<RealD> Coeffs;
 | 
				
			||||||
    int order;
 | 
					    int order;
 | 
				
			||||||
    double hi;
 | 
					    RealD hi;
 | 
				
			||||||
    double lo;
 | 
					    RealD lo;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  public:
 | 
					  public:
 | 
				
			||||||
    void csv(std::ostream &out){
 | 
					    void csv(std::ostream &out){
 | 
				
			||||||
      for (double x=lo; x<hi; x+=(hi-lo)/1000) {
 | 
					      for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
 | 
				
			||||||
	double f = approx(x);
 | 
						RealD f = approx(x);
 | 
				
			||||||
	out<< x<<" "<<f<<std::endl;
 | 
						out<< x<<" "<<f<<std::endl;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
      return;
 | 
					      return;
 | 
				
			||||||
@@ -53,15 +64,19 @@ namespace Grid {
 | 
				
			|||||||
    // Convenience for plotting the approximation
 | 
					    // Convenience for plotting the approximation
 | 
				
			||||||
    void   PlotApprox(std::ostream &out) {
 | 
					    void   PlotApprox(std::ostream &out) {
 | 
				
			||||||
      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
 | 
					      out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
 | 
				
			||||||
      for(double x=lo;x<hi;x+=(hi-lo)/50.0){
 | 
					      for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
 | 
				
			||||||
	out <<x<<"\t"<<approx(x)<<std::endl;
 | 
						out <<x<<"\t"<<approx(x)<<std::endl;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Chebyshev(){};
 | 
				
			||||||
 | 
					    Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
 | 
					    // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
 | 
				
			||||||
    //
 | 
					    ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
    Chebyshev(double _lo,double _hi,int _order, double (* func)(double) ){
 | 
					    void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
      lo=_lo;
 | 
					      lo=_lo;
 | 
				
			||||||
      hi=_hi;
 | 
					      hi=_hi;
 | 
				
			||||||
      order=_order;
 | 
					      order=_order;
 | 
				
			||||||
@@ -69,24 +84,26 @@ namespace Grid {
 | 
				
			|||||||
      if(order < 2) exit(-1);
 | 
					      if(order < 2) exit(-1);
 | 
				
			||||||
      Coeffs.resize(order);
 | 
					      Coeffs.resize(order);
 | 
				
			||||||
      for(int j=0;j<order;j++){
 | 
					      for(int j=0;j<order;j++){
 | 
				
			||||||
	double s=0;
 | 
						RealD s=0;
 | 
				
			||||||
	for(int k=0;k<order;k++){
 | 
						for(int k=0;k<order;k++){
 | 
				
			||||||
	  double y=std::cos(M_PI*(k+0.5)/order);
 | 
						  RealD y=std::cos(M_PI*(k+0.5)/order);
 | 
				
			||||||
	  double x=0.5*(y*(hi-lo)+(hi+lo));
 | 
						  RealD x=0.5*(y*(hi-lo)+(hi+lo));
 | 
				
			||||||
	  double f=func(x);
 | 
						  RealD f=func(x);
 | 
				
			||||||
	  s=s+f*std::cos( j*M_PI*(k+0.5)/order );
 | 
						  s=s+f*std::cos( j*M_PI*(k+0.5)/order );
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	Coeffs[j] = s * 2.0/order;
 | 
						Coeffs[j] = s * 2.0/order;
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    void JacksonSmooth(void){
 | 
					    void JacksonSmooth(void){
 | 
				
			||||||
      double M=order;
 | 
					      RealD M=order;
 | 
				
			||||||
      double alpha = M_PI/(M+2);
 | 
					      RealD alpha = M_PI/(M+2);
 | 
				
			||||||
      double lmax = std::cos(alpha);
 | 
					      RealD lmax = std::cos(alpha);
 | 
				
			||||||
      double sumUsq =0;
 | 
					      RealD sumUsq =0;
 | 
				
			||||||
      std::vector<double> U(M);
 | 
					      std::vector<RealD> U(M);
 | 
				
			||||||
      std::vector<double> a(M);
 | 
					      std::vector<RealD> a(M);
 | 
				
			||||||
      std::vector<double> g(M);
 | 
					      std::vector<RealD> g(M);
 | 
				
			||||||
      for(int n=0;n<=M;n++){
 | 
					      for(int n=0;n<=M;n++){
 | 
				
			||||||
	U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
 | 
						U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
 | 
				
			||||||
	sumUsq += U[n]*U[n];
 | 
						sumUsq += U[n]*U[n];
 | 
				
			||||||
@@ -107,18 +124,18 @@ namespace Grid {
 | 
				
			|||||||
	Coeffs[m]*=g[m];
 | 
						Coeffs[m]*=g[m];
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    double approx(double x) // Convenience for plotting the approximation
 | 
					    RealD approx(RealD x) // Convenience for plotting the approximation
 | 
				
			||||||
    {
 | 
					    {
 | 
				
			||||||
      double Tn;
 | 
					      RealD Tn;
 | 
				
			||||||
      double Tnm;
 | 
					      RealD Tnm;
 | 
				
			||||||
      double Tnp;
 | 
					      RealD Tnp;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      double y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
 | 
					      RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      double T0=1;
 | 
					      RealD T0=1;
 | 
				
			||||||
      double T1=y;
 | 
					      RealD T1=y;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
      double sum;
 | 
					      RealD sum;
 | 
				
			||||||
      sum = 0.5*Coeffs[0]*T0;
 | 
					      sum = 0.5*Coeffs[0]*T0;
 | 
				
			||||||
      sum+= Coeffs[1]*T1;
 | 
					      sum+= Coeffs[1]*T1;
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
@@ -151,8 +168,8 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
 | 
					      std::cout<<GridLogMessage << "Chebyshev ["<<lo<<","<<hi<<"]"<< " order "<<order <<std::endl;
 | 
				
			||||||
      // Tn=T1 = (xscale M + mscale)in
 | 
					      // Tn=T1 = (xscale M + mscale)in
 | 
				
			||||||
      double xscale = 2.0/(hi-lo);
 | 
					      RealD xscale = 2.0/(hi-lo);
 | 
				
			||||||
      double mscale = -(hi+lo)/(hi-lo);
 | 
					      RealD mscale = -(hi+lo)/(hi-lo);
 | 
				
			||||||
      Linop.HermOp(T0,y);
 | 
					      Linop.HermOp(T0,y);
 | 
				
			||||||
      T1=y*xscale+in*mscale;
 | 
					      T1=y*xscale+in*mscale;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -179,5 +196,121 @@ namespace Grid {
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  template<class Field>
 | 
				
			||||||
 | 
					  class ChebyshevLanczos : public Chebyshev<Field> {
 | 
				
			||||||
 | 
					  private:
 | 
				
			||||||
 | 
					    std::vector<RealD> Coeffs;
 | 
				
			||||||
 | 
					    int order;
 | 
				
			||||||
 | 
					    RealD alpha;
 | 
				
			||||||
 | 
					    RealD beta;
 | 
				
			||||||
 | 
					    RealD mu;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  public:
 | 
				
			||||||
 | 
					    ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
 | 
				
			||||||
 | 
					    alpha(_alpha),
 | 
				
			||||||
 | 
					      beta(_beta),
 | 
				
			||||||
 | 
					          mu(_mu)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      order=_order;
 | 
				
			||||||
 | 
					      Coeffs.resize(order);
 | 
				
			||||||
 | 
					      for(int i=0;i<_order;i++){
 | 
				
			||||||
 | 
						Coeffs[i] = 0.0;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      Coeffs[order-1]=1.0;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    void csv(std::ostream &out){
 | 
				
			||||||
 | 
					      for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
 | 
				
			||||||
 | 
						RealD f = approx(x);
 | 
				
			||||||
 | 
						out<< x<<" "<<f<<std::endl;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    RealD approx(RealD xx) // Convenience for plotting the approximation
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      RealD Tn;
 | 
				
			||||||
 | 
					      RealD Tnm;
 | 
				
			||||||
 | 
					      RealD Tnp;
 | 
				
			||||||
 | 
					      Real aa = alpha * alpha;
 | 
				
			||||||
 | 
					      Real bb = beta  *  beta;
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      RealD y= x;
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      RealD T0=1;
 | 
				
			||||||
 | 
					      RealD T1=y;
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      RealD sum;
 | 
				
			||||||
 | 
					      sum = 0.5*Coeffs[0]*T0;
 | 
				
			||||||
 | 
					      sum+= Coeffs[1]*T1;
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      Tn =T1;
 | 
				
			||||||
 | 
					      Tnm=T0;
 | 
				
			||||||
 | 
					      for(int i=2;i<order;i++){
 | 
				
			||||||
 | 
						Tnp=2*y*Tn-Tnm;
 | 
				
			||||||
 | 
						Tnm=Tn;
 | 
				
			||||||
 | 
						Tn =Tnp;
 | 
				
			||||||
 | 
						sum+= Tn*Coeffs[i];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      return sum;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // shift_Multiply in Rudy's code
 | 
				
			||||||
 | 
					    void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out) 
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      GridBase *grid=in._grid;
 | 
				
			||||||
 | 
					      Field tmp(grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      RealD aa= alpha*alpha;
 | 
				
			||||||
 | 
					      RealD bb= beta * beta;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      Linop.HermOp(in,out);
 | 
				
			||||||
 | 
					      out = out - mu*in;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      Linop.HermOp(out,tmp);
 | 
				
			||||||
 | 
					      tmp = tmp - mu * out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      out = (2.0/ (aa-bb) ) * tmp -  ((aa+bb)/(aa-bb))*in;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    // Implement the required interface
 | 
				
			||||||
 | 
					    void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      GridBase *grid=in._grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      int vol=grid->gSites();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      Field T0(grid); T0 = in;  
 | 
				
			||||||
 | 
					      Field T1(grid); 
 | 
				
			||||||
 | 
					      Field T2(grid);
 | 
				
			||||||
 | 
					      Field  y(grid);
 | 
				
			||||||
 | 
					      
 | 
				
			||||||
 | 
					      Field *Tnm = &T0;
 | 
				
			||||||
 | 
					      Field *Tn  = &T1;
 | 
				
			||||||
 | 
					      Field *Tnp = &T2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Tn=T1 = (xscale M )*in
 | 
				
			||||||
 | 
					      AminusMuSq(Linop,T0,T1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // sum = .5 c[0] T0 + c[1] T1
 | 
				
			||||||
 | 
					      out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
 | 
				
			||||||
 | 
					      for(int n=2;n<order;n++){
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						AminusMuSq(Linop,*Tn,y);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						*Tnp=2.0*y-(*Tnm);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						out=out+Coeffs[n]* (*Tnp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						// Cycle pointers to avoid copies
 | 
				
			||||||
 | 
						Field *swizzle = Tnm;
 | 
				
			||||||
 | 
						Tnm    =Tn;
 | 
				
			||||||
 | 
						Tn     =Tnp;
 | 
				
			||||||
 | 
						Tnp    =swizzle;
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										106
									
								
								lib/algorithms/iterative/DenseMatrix.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										106
									
								
								lib/algorithms/iterative/DenseMatrix.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,106 @@
 | 
				
			|||||||
 | 
					#ifndef GRID_DENSE_MATRIX_H
 | 
				
			||||||
 | 
					#define GRID_DENSE_MATRIX_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					    /////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					    // Matrix untils
 | 
				
			||||||
 | 
					    /////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class T> using DenseVector = std::vector<T>;
 | 
				
			||||||
 | 
					template<class T> using DenseMatrix = DenseVector<DenseVector<T> >;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class T> void Size(DenseVector<T> & vec, int &N) 
 | 
				
			||||||
 | 
					{ 
 | 
				
			||||||
 | 
					  N= vec.size();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					template<class T> void Size(DenseMatrix<T> & mat, int &N,int &M) 
 | 
				
			||||||
 | 
					{ 
 | 
				
			||||||
 | 
					  N= mat.size();
 | 
				
			||||||
 | 
					  M= mat[0].size();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class T> void SizeSquare(DenseMatrix<T> & mat, int &N) 
 | 
				
			||||||
 | 
					{ 
 | 
				
			||||||
 | 
					  int M; Size(mat,N,M);
 | 
				
			||||||
 | 
					  assert(N==M);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class T> void Resize(DenseMatrix<T > & mat, int N, int M) { 
 | 
				
			||||||
 | 
					  mat.resize(N);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    mat[i].resize(M);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					template<class T> void Fill(DenseMatrix<T> & mat, T&val) { 
 | 
				
			||||||
 | 
					  int N,M;
 | 
				
			||||||
 | 
					  Size(mat,N,M);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					  for(int j=0;j<M;j++){
 | 
				
			||||||
 | 
					    mat[i][j] = val;
 | 
				
			||||||
 | 
					  }}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Transpose of a matrix **/
 | 
				
			||||||
 | 
					template<class T> DenseMatrix<T> Transpose(DenseMatrix<T> & mat){
 | 
				
			||||||
 | 
					  int N,M;
 | 
				
			||||||
 | 
					  Size(mat,N,M);
 | 
				
			||||||
 | 
					  DenseMatrix<T> C; Resize(C,M,N);
 | 
				
			||||||
 | 
					  for(int i=0;i<M;i++){
 | 
				
			||||||
 | 
					  for(int j=0;j<N;j++){
 | 
				
			||||||
 | 
					    C[i][j] = mat[j][i];
 | 
				
			||||||
 | 
					  }} 
 | 
				
			||||||
 | 
					  return C;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** Set DenseMatrix to unit matrix **/
 | 
				
			||||||
 | 
					template<class T> void Unity(DenseMatrix<T> &A){
 | 
				
			||||||
 | 
					  int N;  SizeSquare(A,N);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    for(int j=0;j<N;j++){
 | 
				
			||||||
 | 
					      if ( i==j ) A[i][j] = 1;
 | 
				
			||||||
 | 
					      else        A[i][j] = 0;
 | 
				
			||||||
 | 
					    } 
 | 
				
			||||||
 | 
					  } 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Add C * I to matrix **/
 | 
				
			||||||
 | 
					template<class T>
 | 
				
			||||||
 | 
					void PlusUnit(DenseMatrix<T> & A,T c){
 | 
				
			||||||
 | 
					  int dim;  SizeSquare(A,dim);
 | 
				
			||||||
 | 
					  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** return the Hermitian conjugate of matrix **/
 | 
				
			||||||
 | 
					template<class T>
 | 
				
			||||||
 | 
					DenseMatrix<T> HermitianConj(DenseMatrix<T> &mat){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int dim; SizeSquare(mat,dim);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  DenseMatrix<T> C; Resize(C,dim,dim);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i<dim;i++){
 | 
				
			||||||
 | 
					    for(int j=0;j<dim;j++){
 | 
				
			||||||
 | 
					      C[i][j] = conj(mat[j][i]);
 | 
				
			||||||
 | 
					    } 
 | 
				
			||||||
 | 
					  } 
 | 
				
			||||||
 | 
					  return C;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/**Get a square submatrix**/
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st, int col_end)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  DenseMatrix<T> H; Resize(H,row_end - row_st,col_end-col_st);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i = row_st; i<row_end; i++){
 | 
				
			||||||
 | 
					  for(int j = col_st; j<col_end; j++){
 | 
				
			||||||
 | 
					    H[i-row_st][j-col_st]=A[i][j];
 | 
				
			||||||
 | 
					  }}
 | 
				
			||||||
 | 
					  return H;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <algorithms/iterative/Householder.h>
 | 
				
			||||||
 | 
					#include <algorithms/iterative/Francis.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
							
								
								
									
										52
									
								
								lib/algorithms/iterative/EigenSort.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								lib/algorithms/iterative/EigenSort.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,52 @@
 | 
				
			|||||||
 | 
					#ifndef GRID_EIGENSORT_H
 | 
				
			||||||
 | 
					#define GRID_EIGENSORT_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					    /////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					    // Eigen sorter to begin with
 | 
				
			||||||
 | 
					    /////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class Field>
 | 
				
			||||||
 | 
					class SortEigen {
 | 
				
			||||||
 | 
					 private:
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  static bool less_lmd(RealD left,RealD right){
 | 
				
			||||||
 | 
					    return fabs(left) < fabs(right);
 | 
				
			||||||
 | 
					  }  
 | 
				
			||||||
 | 
					  static bool less_pair(std::pair<RealD,Field>& left,
 | 
				
			||||||
 | 
							 std::pair<RealD,Field>& right){
 | 
				
			||||||
 | 
					    return fabs(left.first) < fabs(right.first);
 | 
				
			||||||
 | 
					  }  
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					 public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  void push(DenseVector<RealD>& lmd,
 | 
				
			||||||
 | 
						    DenseVector<Field>& evec,int N) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    DenseVector<std::pair<RealD, Field> > emod;
 | 
				
			||||||
 | 
					    typename DenseVector<std::pair<RealD, Field> >::iterator it;
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    for(int i=0;i<lmd.size();++i){
 | 
				
			||||||
 | 
					      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    it=emod.begin();
 | 
				
			||||||
 | 
					    for(int i=0;i<N;++i){
 | 
				
			||||||
 | 
					      lmd[i]=it->first;
 | 
				
			||||||
 | 
					      evec[i]=it->second;
 | 
				
			||||||
 | 
					      ++it;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  void push(DenseVector<RealD>& lmd,int N) {
 | 
				
			||||||
 | 
					    std::partial_sort(lmd.begin(),lmd.begin()+N,lmd.end(),less_lmd);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  bool saturated(RealD lmd, RealD thrs) {
 | 
				
			||||||
 | 
					    return fabs(lmd) > fabs(thrs);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										498
									
								
								lib/algorithms/iterative/Francis.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										498
									
								
								lib/algorithms/iterative/Francis.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,498 @@
 | 
				
			|||||||
 | 
					#ifndef FRANCIS_H
 | 
				
			||||||
 | 
					#define FRANCIS_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <cstdlib>
 | 
				
			||||||
 | 
					#include <string>
 | 
				
			||||||
 | 
					#include <cmath>
 | 
				
			||||||
 | 
					#include <iostream>
 | 
				
			||||||
 | 
					#include <sstream>
 | 
				
			||||||
 | 
					#include <stdexcept>
 | 
				
			||||||
 | 
					#include <fstream>
 | 
				
			||||||
 | 
					#include <complex>
 | 
				
			||||||
 | 
					#include <algorithm>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//#include <timer.h>
 | 
				
			||||||
 | 
					//#include <lapacke.h>
 | 
				
			||||||
 | 
					//#include <Eigen/Dense>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T> int SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
 | 
				
			||||||
 | 
					template <class T> int     Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					  Find the eigenvalues of an upper hessenberg matrix using the Francis QR algorithm.
 | 
				
			||||||
 | 
					H =
 | 
				
			||||||
 | 
					      x  x  x  x  x  x  x  x  x
 | 
				
			||||||
 | 
					      x  x  x  x  x  x  x  x  x
 | 
				
			||||||
 | 
					      0  x  x  x  x  x  x  x  x
 | 
				
			||||||
 | 
					      0  0  x  x  x  x  x  x  x
 | 
				
			||||||
 | 
					      0  0  0  x  x  x  x  x  x
 | 
				
			||||||
 | 
					      0  0  0  0  x  x  x  x  x
 | 
				
			||||||
 | 
					      0  0  0  0  0  x  x  x  x
 | 
				
			||||||
 | 
					      0  0  0  0  0  0  x  x  x
 | 
				
			||||||
 | 
					      0  0  0  0  0  0  0  x  x
 | 
				
			||||||
 | 
					Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.
 | 
				
			||||||
 | 
					**/
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					int QReigensystem(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  DenseMatrix<T> H = Hin; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int N ; SizeSquare(H,N);
 | 
				
			||||||
 | 
					  int M = N;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Fill(evals,0);
 | 
				
			||||||
 | 
					  Fill(evecs,0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  T s,t,x=0,y=0,z=0;
 | 
				
			||||||
 | 
					  T u,d;
 | 
				
			||||||
 | 
					  T apd,amd,bc;
 | 
				
			||||||
 | 
					  DenseVector<T> p(N,0);
 | 
				
			||||||
 | 
					  T nrm = Norm(H);    ///DenseMatrix Norm
 | 
				
			||||||
 | 
					  int n, m;
 | 
				
			||||||
 | 
					  int e = 0;
 | 
				
			||||||
 | 
					  int it = 0;
 | 
				
			||||||
 | 
					  int tot_it = 0;
 | 
				
			||||||
 | 
					  int l = 0;
 | 
				
			||||||
 | 
					  int r = 0;
 | 
				
			||||||
 | 
					  DenseMatrix<T> P; Resize(P,N,N); Unity(P);
 | 
				
			||||||
 | 
					  DenseVector<int> trows(N,0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /// Check if the matrix is really hessenberg, if not abort
 | 
				
			||||||
 | 
					  RealD sth = 0;
 | 
				
			||||||
 | 
					  for(int j=0;j<N;j++){
 | 
				
			||||||
 | 
					    for(int i=j+2;i<N;i++){
 | 
				
			||||||
 | 
					      sth = abs(H[i][j]);
 | 
				
			||||||
 | 
					      if(sth > small){
 | 
				
			||||||
 | 
						std::cout << "Non hessenberg H = " << sth << " > " << small << std::endl;
 | 
				
			||||||
 | 
						exit(1);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  do{
 | 
				
			||||||
 | 
					    std::cout << "Francis QR Step N = " << N << std::endl;
 | 
				
			||||||
 | 
					    /** Check for convergence
 | 
				
			||||||
 | 
					      x  x  x  x  x
 | 
				
			||||||
 | 
					      0  x  x  x  x
 | 
				
			||||||
 | 
					      0  0  x  x  x
 | 
				
			||||||
 | 
					      0  0  x  x  x
 | 
				
			||||||
 | 
					      0  0  0  0  x
 | 
				
			||||||
 | 
					      for this matrix l = 4
 | 
				
			||||||
 | 
					     **/
 | 
				
			||||||
 | 
					    do{
 | 
				
			||||||
 | 
					      l = Chop_subdiag(H,nrm,e,small);
 | 
				
			||||||
 | 
					      r = 0;    ///May have converged on more than one eval
 | 
				
			||||||
 | 
					      ///Single eval
 | 
				
			||||||
 | 
					      if(l == N-1){
 | 
				
			||||||
 | 
					        evals[e] = H[l][l];
 | 
				
			||||||
 | 
					        N--; e++; r++; it = 0;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      ///RealD eval
 | 
				
			||||||
 | 
					      if(l == N-2){
 | 
				
			||||||
 | 
					        trows[l+1] = 1;    ///Needed for UTSolve
 | 
				
			||||||
 | 
					        apd = H[l][l] + H[l+1][l+1];
 | 
				
			||||||
 | 
					        amd = H[l][l] - H[l+1][l+1];
 | 
				
			||||||
 | 
					        bc =  (T)4.0*H[l+1][l]*H[l][l+1];
 | 
				
			||||||
 | 
					        evals[e]   = (T)0.5*( apd + sqrt(amd*amd + bc) );
 | 
				
			||||||
 | 
					        evals[e+1] = (T)0.5*( apd - sqrt(amd*amd + bc) );
 | 
				
			||||||
 | 
					        N-=2; e+=2; r++; it = 0;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    } while(r>0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if(N ==0) break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    DenseVector<T > ck; Resize(ck,3);
 | 
				
			||||||
 | 
					    DenseVector<T> v;   Resize(v,3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int m = N-3; m >= l; m--){
 | 
				
			||||||
 | 
					      ///Starting vector essentially random shift.
 | 
				
			||||||
 | 
					      if(it%10 == 0 && N >= 3 && it > 0){
 | 
				
			||||||
 | 
					        s = (T)1.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
 | 
				
			||||||
 | 
					        t = (T)0.618033989*( abs( H[N-1][N-2] ) + abs( H[N-2][N-3] ) );
 | 
				
			||||||
 | 
					        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
 | 
				
			||||||
 | 
					        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
 | 
				
			||||||
 | 
					        z = H[m+1][m]*H[m+2][m+1];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      ///Starting vector implicit Q theorem
 | 
				
			||||||
 | 
					      else{
 | 
				
			||||||
 | 
					        s = (H[N-2][N-2] + H[N-1][N-1]);
 | 
				
			||||||
 | 
					        t = (H[N-2][N-2]*H[N-1][N-1] - H[N-2][N-1]*H[N-1][N-2]);
 | 
				
			||||||
 | 
					        x = H[m][m]*H[m][m] + H[m][m+1]*H[m+1][m] - s*H[m][m] + t;
 | 
				
			||||||
 | 
					        y = H[m+1][m]*(H[m][m] + H[m+1][m+1] - s);
 | 
				
			||||||
 | 
					        z = H[m+1][m]*H[m+2][m+1];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      ck[0] = x; ck[1] = y; ck[2] = z;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      if(m == l) break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      /** Some stupid thing from numerical recipies, seems to work**/
 | 
				
			||||||
 | 
					      // PAB.. for heaven's sake quote page, purpose, evidence it works.
 | 
				
			||||||
 | 
					      //       what sort of comment is that!?!?!?
 | 
				
			||||||
 | 
					      u=abs(H[m][m-1])*(abs(y)+abs(z));
 | 
				
			||||||
 | 
					      d=abs(x)*(abs(H[m-1][m-1])+abs(H[m][m])+abs(H[m+1][m+1]));
 | 
				
			||||||
 | 
					      if ((T)abs(u+d) == (T)abs(d) ){
 | 
				
			||||||
 | 
						l = m; break;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      //if (u < small){l = m; break;}
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if(it > 100000){
 | 
				
			||||||
 | 
					     std::cout << "QReigensystem: bugger it got stuck after 100000 iterations" << std::endl;
 | 
				
			||||||
 | 
					     std::cout << "got " << e << " evals " << l << " " << N << std::endl;
 | 
				
			||||||
 | 
					      exit(1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    normalize(ck);    ///Normalization cancels in PHP anyway
 | 
				
			||||||
 | 
					    T beta;
 | 
				
			||||||
 | 
					    Householder_vector<T >(ck, 0, 2, v, beta);
 | 
				
			||||||
 | 
					    Householder_mult<T >(H,v,beta,0,l,l+2,0);
 | 
				
			||||||
 | 
					    Householder_mult<T >(H,v,beta,0,l,l+2,1);
 | 
				
			||||||
 | 
					    ///Accumulate eigenvector
 | 
				
			||||||
 | 
					    Householder_mult<T >(P,v,beta,0,l,l+2,1);
 | 
				
			||||||
 | 
					    int sw = 0;      ///Are we on the last row?
 | 
				
			||||||
 | 
					    for(int k=l;k<N-2;k++){
 | 
				
			||||||
 | 
					      x = H[k+1][k];
 | 
				
			||||||
 | 
					      y = H[k+2][k];
 | 
				
			||||||
 | 
					      z = (T)0.0;
 | 
				
			||||||
 | 
					      if(k+3 <= N-1){
 | 
				
			||||||
 | 
						z = H[k+3][k];
 | 
				
			||||||
 | 
					      } else{
 | 
				
			||||||
 | 
						sw = 1; 
 | 
				
			||||||
 | 
						v[2] = (T)0.0;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      ck[0] = x; ck[1] = y; ck[2] = z;
 | 
				
			||||||
 | 
					      normalize(ck);
 | 
				
			||||||
 | 
					      Householder_vector<T >(ck, 0, 2-sw, v, beta);
 | 
				
			||||||
 | 
					      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,0);
 | 
				
			||||||
 | 
					      Householder_mult<T >(H,v, beta,0,k+1,k+3-sw,1);
 | 
				
			||||||
 | 
					      ///Accumulate eigenvector
 | 
				
			||||||
 | 
					      Householder_mult<T >(P,v, beta,0,k+1,k+3-sw,1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    it++;
 | 
				
			||||||
 | 
					    tot_it++;
 | 
				
			||||||
 | 
					  }while(N > 1);
 | 
				
			||||||
 | 
					  N = evals.size();
 | 
				
			||||||
 | 
					  ///Annoying - UT solves in reverse order;
 | 
				
			||||||
 | 
					  DenseVector<T> tmp; Resize(tmp,N);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    tmp[i] = evals[N-i-1];
 | 
				
			||||||
 | 
					  } 
 | 
				
			||||||
 | 
					  evals = tmp;
 | 
				
			||||||
 | 
					  UTeigenvectors(H, trows, evals, evecs);
 | 
				
			||||||
 | 
					  for(int i=0;i<evals.size();i++){evecs[i] = P*evecs[i]; normalize(evecs[i]);}
 | 
				
			||||||
 | 
					  return tot_it;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  /**
 | 
				
			||||||
 | 
					  Find the eigenvalues of an upper Hessenberg matrix using the Wilkinson QR algorithm.
 | 
				
			||||||
 | 
					  H =
 | 
				
			||||||
 | 
					  x  x  0  0  0  0
 | 
				
			||||||
 | 
					  x  x  x  0  0  0
 | 
				
			||||||
 | 
					  0  x  x  x  0  0
 | 
				
			||||||
 | 
					  0  0  x  x  x  0
 | 
				
			||||||
 | 
					  0  0  0  x  x  x
 | 
				
			||||||
 | 
					  0  0  0  0  x  x
 | 
				
			||||||
 | 
					  Factorization is P T P^H where T is upper triangular (mod cc blocks) and P is orthagonal/unitary.  **/
 | 
				
			||||||
 | 
					  return my_Wilkinson(Hin, evals, evecs, small, small);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					int my_Wilkinson(DenseMatrix<T> &Hin, DenseVector<T> &evals, DenseMatrix<T> &evecs, RealD small, RealD tol)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int N; SizeSquare(Hin,N);
 | 
				
			||||||
 | 
					  int M = N;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ///I don't want to modify the input but matricies must be passed by reference
 | 
				
			||||||
 | 
					  //Scale a matrix by its "norm"
 | 
				
			||||||
 | 
					  //RealD Hnorm = abs( Hin.LargestDiag() ); H =  H*(1.0/Hnorm);
 | 
				
			||||||
 | 
					  DenseMatrix<T> H;  H = Hin;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  RealD Hnorm = abs(Norm(Hin));
 | 
				
			||||||
 | 
					  H = H * (1.0 / Hnorm);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // TODO use openmp and memset
 | 
				
			||||||
 | 
					  Fill(evals,0);
 | 
				
			||||||
 | 
					  Fill(evecs,0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  T s, t, x = 0, y = 0, z = 0;
 | 
				
			||||||
 | 
					  T u, d;
 | 
				
			||||||
 | 
					  T apd, amd, bc;
 | 
				
			||||||
 | 
					  DenseVector<T> p; Resize(p,N); Fill(p,0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  T nrm = Norm(H);    ///DenseMatrix Norm
 | 
				
			||||||
 | 
					  int n, m;
 | 
				
			||||||
 | 
					  int e = 0;
 | 
				
			||||||
 | 
					  int it = 0;
 | 
				
			||||||
 | 
					  int tot_it = 0;
 | 
				
			||||||
 | 
					  int l = 0;
 | 
				
			||||||
 | 
					  int r = 0;
 | 
				
			||||||
 | 
					  DenseMatrix<T> P; Resize(P,N,N);
 | 
				
			||||||
 | 
					  Unity(P);
 | 
				
			||||||
 | 
					  DenseVector<int> trows(N, 0);
 | 
				
			||||||
 | 
					  /// Check if the matrix is really symm tridiag
 | 
				
			||||||
 | 
					  RealD sth = 0;
 | 
				
			||||||
 | 
					  for(int j = 0; j < N; ++j)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    for(int i = j + 2; i < N; ++i)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      if(abs(H[i][j]) > tol || abs(H[j][i]) > tol)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
						std::cout << "Non Tridiagonal H(" << i << ","<< j << ") = |" << Real( real( H[j][i] ) ) << "| > " << tol << std::endl;
 | 
				
			||||||
 | 
						std::cout << "Warning tridiagonalize and call again" << std::endl;
 | 
				
			||||||
 | 
					        // exit(1); // see what is going on
 | 
				
			||||||
 | 
					        //return;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  do{
 | 
				
			||||||
 | 
					    do{
 | 
				
			||||||
 | 
					      //Jasper
 | 
				
			||||||
 | 
					      //Check if the subdiagonal term is small enough (<small)
 | 
				
			||||||
 | 
					      //if true then it is converged.
 | 
				
			||||||
 | 
					      //check start from H.dim - e - 1
 | 
				
			||||||
 | 
					      //How to deal with more than 2 are converged?
 | 
				
			||||||
 | 
					      //What if Chop_symm_subdiag return something int the middle?
 | 
				
			||||||
 | 
					      //--------------
 | 
				
			||||||
 | 
					      l = Chop_symm_subdiag(H,nrm, e, small);
 | 
				
			||||||
 | 
					      r = 0;    ///May have converged on more than one eval
 | 
				
			||||||
 | 
					      //Jasper
 | 
				
			||||||
 | 
					      //In this case
 | 
				
			||||||
 | 
					      // x  x  0  0  0  0
 | 
				
			||||||
 | 
					      // x  x  x  0  0  0
 | 
				
			||||||
 | 
					      // 0  x  x  x  0  0
 | 
				
			||||||
 | 
					      // 0  0  x  x  x  0
 | 
				
			||||||
 | 
					      // 0  0  0  x  x  0
 | 
				
			||||||
 | 
					      // 0  0  0  0  0  x  <- l
 | 
				
			||||||
 | 
					      //--------------
 | 
				
			||||||
 | 
					      ///Single eval
 | 
				
			||||||
 | 
					      if(l == N - 1)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        evals[e] = H[l][l];
 | 
				
			||||||
 | 
					        N--;
 | 
				
			||||||
 | 
					        e++;
 | 
				
			||||||
 | 
					        r++;
 | 
				
			||||||
 | 
					        it = 0;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      //Jasper
 | 
				
			||||||
 | 
					      // x  x  0  0  0  0
 | 
				
			||||||
 | 
					      // x  x  x  0  0  0
 | 
				
			||||||
 | 
					      // 0  x  x  x  0  0
 | 
				
			||||||
 | 
					      // 0  0  x  x  0  0
 | 
				
			||||||
 | 
					      // 0  0  0  0  x  x  <- l
 | 
				
			||||||
 | 
					      // 0  0  0  0  x  x
 | 
				
			||||||
 | 
					      //--------------
 | 
				
			||||||
 | 
					      ///RealD eval
 | 
				
			||||||
 | 
					      if(l == N - 2)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        trows[l + 1] = 1;    ///Needed for UTSolve
 | 
				
			||||||
 | 
					        apd = H[l][l] + H[l + 1][ l + 1];
 | 
				
			||||||
 | 
					        amd = H[l][l] - H[l + 1][l + 1];
 | 
				
			||||||
 | 
					        bc =  (T) 4.0 * H[l + 1][l] * H[l][l + 1];
 | 
				
			||||||
 | 
					        evals[e] = (T) 0.5 * (apd + sqrt(amd * amd + bc));
 | 
				
			||||||
 | 
					        evals[e + 1] = (T) 0.5 * (apd - sqrt(amd * amd + bc));
 | 
				
			||||||
 | 
					        N -= 2;
 | 
				
			||||||
 | 
					        e += 2;
 | 
				
			||||||
 | 
					        r++;
 | 
				
			||||||
 | 
					        it = 0;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }while(r > 0);
 | 
				
			||||||
 | 
					    //Jasper
 | 
				
			||||||
 | 
					    //Already converged
 | 
				
			||||||
 | 
					    //--------------
 | 
				
			||||||
 | 
					    if(N == 0) break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    DenseVector<T> ck,v; Resize(ck,2); Resize(v,2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int m = N - 3; m >= l; m--)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      ///Starting vector essentially random shift.
 | 
				
			||||||
 | 
					      if(it%10 == 0 && N >= 3 && it > 0)
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        t = abs(H[N - 1][N - 2]) + abs(H[N - 2][N - 3]);
 | 
				
			||||||
 | 
					        x = H[m][m] - t;
 | 
				
			||||||
 | 
					        z = H[m + 1][m];
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
					      ///Starting vector implicit Q theorem
 | 
				
			||||||
 | 
					        d = (H[N - 2][N - 2] - H[N - 1][N - 1]) * (T) 0.5;
 | 
				
			||||||
 | 
					        t =  H[N - 1][N - 1] - H[N - 1][N - 2] * H[N - 1][N - 2] 
 | 
				
			||||||
 | 
						  / (d + sign(d) * sqrt(d * d + H[N - 1][N - 2] * H[N - 1][N - 2]));
 | 
				
			||||||
 | 
					        x = H[m][m] - t;
 | 
				
			||||||
 | 
					        z = H[m + 1][m];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      //Jasper
 | 
				
			||||||
 | 
					      //why it is here????
 | 
				
			||||||
 | 
					      //-----------------------
 | 
				
			||||||
 | 
					      if(m == l)
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      u = abs(H[m][m - 1]) * (abs(y) + abs(z));
 | 
				
			||||||
 | 
					      d = abs(x) * (abs(H[m - 1][m - 1]) + abs(H[m][m]) + abs(H[m + 1][m + 1]));
 | 
				
			||||||
 | 
					      if ((T)abs(u + d) == (T)abs(d))
 | 
				
			||||||
 | 
					      {
 | 
				
			||||||
 | 
					        l = m;
 | 
				
			||||||
 | 
					        break;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //Jasper
 | 
				
			||||||
 | 
					    if(it > 1000000)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      std::cout << "Wilkinson: bugger it got stuck after 100000 iterations" << std::endl;
 | 
				
			||||||
 | 
					      std::cout << "got " << e << " evals " << l << " " << N << std::endl;
 | 
				
			||||||
 | 
					      exit(1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    //
 | 
				
			||||||
 | 
					    T s, c;
 | 
				
			||||||
 | 
					    Givens_calc<T>(x, z, c, s);
 | 
				
			||||||
 | 
					    Givens_mult<T>(H, l, l + 1, c, -s, 0);
 | 
				
			||||||
 | 
					    Givens_mult<T>(H, l, l + 1, c,  s, 1);
 | 
				
			||||||
 | 
					    Givens_mult<T>(P, l, l + 1, c,  s, 1);
 | 
				
			||||||
 | 
					    //
 | 
				
			||||||
 | 
					    for(int k = l; k < N - 2; ++k)
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      x = H.A[k + 1][k];
 | 
				
			||||||
 | 
					      z = H.A[k + 2][k];
 | 
				
			||||||
 | 
					      Givens_calc<T>(x, z, c, s);
 | 
				
			||||||
 | 
					      Givens_mult<T>(H, k + 1, k + 2, c, -s, 0);
 | 
				
			||||||
 | 
					      Givens_mult<T>(H, k + 1, k + 2, c,  s, 1);
 | 
				
			||||||
 | 
					      Givens_mult<T>(P, k + 1, k + 2, c,  s, 1);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    it++;
 | 
				
			||||||
 | 
					    tot_it++;
 | 
				
			||||||
 | 
					  }while(N > 1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  N = evals.size();
 | 
				
			||||||
 | 
					  ///Annoying - UT solves in reverse order;
 | 
				
			||||||
 | 
					  DenseVector<T> tmp(N);
 | 
				
			||||||
 | 
					  for(int i = 0; i < N; ++i)
 | 
				
			||||||
 | 
					    tmp[i] = evals[N-i-1];
 | 
				
			||||||
 | 
					  evals = tmp;
 | 
				
			||||||
 | 
					  //
 | 
				
			||||||
 | 
					  UTeigenvectors(H, trows, evals, evecs);
 | 
				
			||||||
 | 
					  //UTSymmEigenvectors(H, trows, evals, evecs);
 | 
				
			||||||
 | 
					  for(int i = 0; i < evals.size(); ++i)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    evecs[i] = P * evecs[i];
 | 
				
			||||||
 | 
					    normalize(evecs[i]);
 | 
				
			||||||
 | 
					    evals[i] = evals[i] * Hnorm;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  // // FIXME this is to test
 | 
				
			||||||
 | 
					  // Hin.write("evecs3", evecs);
 | 
				
			||||||
 | 
					  // Hin.write("evals3", evals);
 | 
				
			||||||
 | 
					  // // check rsd
 | 
				
			||||||
 | 
					  // for(int i = 0; i < M; i++) {
 | 
				
			||||||
 | 
					  //   vector<T> Aevec = Hin * evecs[i];
 | 
				
			||||||
 | 
					  //   RealD norm2(0.);
 | 
				
			||||||
 | 
					  //   for(int j = 0; j < M; j++) {
 | 
				
			||||||
 | 
					  //     norm2 += (Aevec[j] - evals[i] * evecs[i][j]) * (Aevec[j] - evals[i] * evecs[i][j]);
 | 
				
			||||||
 | 
					  //   }
 | 
				
			||||||
 | 
					  // }
 | 
				
			||||||
 | 
					  return tot_it;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					void Hess(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /**
 | 
				
			||||||
 | 
					  turn a matrix A =
 | 
				
			||||||
 | 
					  x  x  x  x  x
 | 
				
			||||||
 | 
					  x  x  x  x  x
 | 
				
			||||||
 | 
					  x  x  x  x  x
 | 
				
			||||||
 | 
					  x  x  x  x  x
 | 
				
			||||||
 | 
					  x  x  x  x  x
 | 
				
			||||||
 | 
					  into
 | 
				
			||||||
 | 
					  x  x  x  x  x
 | 
				
			||||||
 | 
					  x  x  x  x  x
 | 
				
			||||||
 | 
					  0  x  x  x  x
 | 
				
			||||||
 | 
					  0  0  x  x  x
 | 
				
			||||||
 | 
					  0  0  0  x  x
 | 
				
			||||||
 | 
					  with householder rotations
 | 
				
			||||||
 | 
					  Slow.
 | 
				
			||||||
 | 
					  */
 | 
				
			||||||
 | 
					  int N ; SizeSquare(A,N);
 | 
				
			||||||
 | 
					  DenseVector<T > p; Resize(p,N); Fill(p,0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int k=start;k<N-2;k++){
 | 
				
			||||||
 | 
					    //cerr << "hess" << k << std::endl;
 | 
				
			||||||
 | 
					    DenseVector<T > ck,v; Resize(ck,N-k-1); Resize(v,N-k-1);
 | 
				
			||||||
 | 
					    for(int i=k+1;i<N;i++){ck[i-k-1] = A(i,k);}  ///kth column
 | 
				
			||||||
 | 
					    normalize(ck);    ///Normalization cancels in PHP anyway
 | 
				
			||||||
 | 
					    T beta;
 | 
				
			||||||
 | 
					    Householder_vector<T >(ck, 0, ck.size()-1, v, beta);  ///Householder vector
 | 
				
			||||||
 | 
					    Householder_mult<T>(A,v,beta,start,k+1,N-1,0);  ///A -> PA
 | 
				
			||||||
 | 
					    Householder_mult<T >(A,v,beta,start,k+1,N-1,1);  ///PA -> PAP^H
 | 
				
			||||||
 | 
					    ///Accumulate eigenvector
 | 
				
			||||||
 | 
					    Householder_mult<T >(Q,v,beta,start,k+1,N-1,1);  ///Q -> QP^H
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  /*for(int l=0;l<N-2;l++){
 | 
				
			||||||
 | 
					    for(int k=l+2;k<N;k++){
 | 
				
			||||||
 | 
					    A(0,k,l);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    }*/
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					void Tri(DenseMatrix<T > &A, DenseMatrix<T> &Q, int start){
 | 
				
			||||||
 | 
					///Tridiagonalize a matrix
 | 
				
			||||||
 | 
					  int N; SizeSquare(A,N);
 | 
				
			||||||
 | 
					  Hess(A,Q,start);
 | 
				
			||||||
 | 
					  /*for(int l=0;l<N-2;l++){
 | 
				
			||||||
 | 
					    for(int k=l+2;k<N;k++){
 | 
				
			||||||
 | 
					    A(0,l,k);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    }*/
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					void ForceTridiagonal(DenseMatrix<T> &A){
 | 
				
			||||||
 | 
					///Tridiagonalize a matrix
 | 
				
			||||||
 | 
					  int N ; SizeSquare(A,N);
 | 
				
			||||||
 | 
					  for(int l=0;l<N-2;l++){
 | 
				
			||||||
 | 
					    for(int k=l+2;k<N;k++){
 | 
				
			||||||
 | 
					      A[l][k]=0;
 | 
				
			||||||
 | 
					      A[k][l]=0;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					int my_SymmEigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
 | 
				
			||||||
 | 
					  ///Solve a symmetric eigensystem, not necessarily in tridiagonal form
 | 
				
			||||||
 | 
					  int N; SizeSquare(Ain,N);
 | 
				
			||||||
 | 
					  DenseMatrix<T > A; A = Ain;
 | 
				
			||||||
 | 
					  DenseMatrix<T > Q; Resize(Q,N,N); Unity(Q);
 | 
				
			||||||
 | 
					  Tri(A,Q,0);
 | 
				
			||||||
 | 
					  int it = my_Wilkinson<T>(A, evals, evecs, small);
 | 
				
			||||||
 | 
					  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
 | 
				
			||||||
 | 
					  return it;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					int Wilkinson(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
 | 
				
			||||||
 | 
					  return my_Wilkinson(Ain, evals, evecs, small);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					int SymmEigensystem(DenseMatrix<T> &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
 | 
				
			||||||
 | 
					  return my_SymmEigensystem(Ain, evals, evecs, small);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					int Eigensystem(DenseMatrix<T > &Ain, DenseVector<T> &evals, DenseVector<DenseVector<T> > &evecs, RealD small){
 | 
				
			||||||
 | 
					///Solve a general eigensystem, not necessarily in tridiagonal form
 | 
				
			||||||
 | 
					  int N = Ain.dim;
 | 
				
			||||||
 | 
					  DenseMatrix<T > A(N); A = Ain;
 | 
				
			||||||
 | 
					  DenseMatrix<T > Q(N);Q.Unity();
 | 
				
			||||||
 | 
					  Hess(A,Q,0);
 | 
				
			||||||
 | 
					  int it = QReigensystem<T>(A, evals, evecs, small);
 | 
				
			||||||
 | 
					  for(int k=0;k<N;k++){evecs[k] = Q*evecs[k];}
 | 
				
			||||||
 | 
					  return it;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										215
									
								
								lib/algorithms/iterative/Householder.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										215
									
								
								lib/algorithms/iterative/Householder.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,215 @@
 | 
				
			|||||||
 | 
					#ifndef HOUSEHOLDER_H
 | 
				
			||||||
 | 
					#define HOUSEHOLDER_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define TIMER(A) std::cout << GridLogMessage << __FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 | 
				
			||||||
 | 
					#define ENTER()  std::cout << GridLogMessage << "ENTRY "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 | 
				
			||||||
 | 
					#define LEAVE()  std::cout << GridLogMessage << "EXIT  "<<__FUNC__ << " file "<< __FILE__ <<" line " << __LINE__ << std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <cstdlib>
 | 
				
			||||||
 | 
					#include <string>
 | 
				
			||||||
 | 
					#include <cmath>
 | 
				
			||||||
 | 
					#include <iostream>
 | 
				
			||||||
 | 
					#include <sstream>
 | 
				
			||||||
 | 
					#include <stdexcept>
 | 
				
			||||||
 | 
					#include <fstream>
 | 
				
			||||||
 | 
					#include <complex>
 | 
				
			||||||
 | 
					#include <algorithm>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					/** Comparison function for finding the max element in a vector **/
 | 
				
			||||||
 | 
					template <class T> bool cf(T i, T j) { 
 | 
				
			||||||
 | 
					  return abs(i) < abs(j); 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** 
 | 
				
			||||||
 | 
						Calculate a real Givens angle 
 | 
				
			||||||
 | 
					 **/
 | 
				
			||||||
 | 
					template <class T> inline void Givens_calc(T y, T z, T &c, T &s){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD mz = (RealD)abs(z);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if(mz==0.0){
 | 
				
			||||||
 | 
					    c = 1; s = 0;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  if(mz >= (RealD)abs(y)){
 | 
				
			||||||
 | 
					    T t = -y/z;
 | 
				
			||||||
 | 
					    s = (T)1.0 / sqrt ((T)1.0 + t * t);
 | 
				
			||||||
 | 
					    c = s * t;
 | 
				
			||||||
 | 
					  } else {
 | 
				
			||||||
 | 
					    T t = -z/y;
 | 
				
			||||||
 | 
					    c = (T)1.0 / sqrt ((T)1.0 + t * t);
 | 
				
			||||||
 | 
					    s = c * t;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T> inline void Givens_mult(DenseMatrix<T> &A,  int i, int k, T c, T s, int dir)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int q ; SizeSquare(A,q);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if(dir == 0){
 | 
				
			||||||
 | 
					    for(int j=0;j<q;j++){
 | 
				
			||||||
 | 
					      T nu = A[i][j];
 | 
				
			||||||
 | 
					      T w  = A[k][j];
 | 
				
			||||||
 | 
					      A[i][j] = (c*nu + s*w);
 | 
				
			||||||
 | 
					      A[k][j] = (-s*nu + c*w);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if(dir == 1){
 | 
				
			||||||
 | 
					    for(int j=0;j<q;j++){
 | 
				
			||||||
 | 
					      T nu = A[j][i];
 | 
				
			||||||
 | 
					      T w  = A[j][k];
 | 
				
			||||||
 | 
					      A[j][i] = (c*nu - s*w);
 | 
				
			||||||
 | 
					      A[j][k] = (s*nu + c*w);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
						from input = x;
 | 
				
			||||||
 | 
						Compute the complex Householder vector, v, such that
 | 
				
			||||||
 | 
						P = (I - b v transpose(v) )
 | 
				
			||||||
 | 
						b = 2/v.v
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						P | x |    | x | k = 0
 | 
				
			||||||
 | 
						| x |    | 0 | 
 | 
				
			||||||
 | 
						| x | =  | 0 |
 | 
				
			||||||
 | 
						| x |    | 0 | j = 3
 | 
				
			||||||
 | 
						| x |	   | x |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						These are the "Unreduced" Householder vectors.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 **/
 | 
				
			||||||
 | 
					template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, DenseVector<T> &v, T &beta)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int N ; Size(input,N);
 | 
				
			||||||
 | 
					  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf<T> );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if(abs(m) > 0.0){
 | 
				
			||||||
 | 
					    T alpha = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int i=k; i<j+1; i++){
 | 
				
			||||||
 | 
					      v[i] = input[i]/m;
 | 
				
			||||||
 | 
					      alpha = alpha + v[i]*conj(v[i]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    alpha = sqrt(alpha);
 | 
				
			||||||
 | 
					    beta = (T)1.0/(alpha*(alpha + abs(v[k]) ));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if(abs(v[k]) > 0.0)  v[k] = v[k] + (v[k]/abs(v[k]))*alpha;
 | 
				
			||||||
 | 
					    else                 v[k] = -alpha;
 | 
				
			||||||
 | 
					  } else{
 | 
				
			||||||
 | 
					    for(int i=k; i<j+1; i++){
 | 
				
			||||||
 | 
					      v[i] = 0.0;
 | 
				
			||||||
 | 
					    } 
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
						from input = x;
 | 
				
			||||||
 | 
						Compute the complex Householder vector, v, such that
 | 
				
			||||||
 | 
						P = (I - b v transpose(v) )
 | 
				
			||||||
 | 
						b = 2/v.v
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						Px = alpha*e_dir
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						These are the "Unreduced" Householder vectors.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 **/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T> inline void Householder_vector(DenseVector<T> input, int k, int j, int dir, DenseVector<T> &v, T &beta)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int N = input.size();
 | 
				
			||||||
 | 
					  T m = *max_element(input.begin() + k, input.begin() + j + 1, cf);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if(abs(m) > 0.0){
 | 
				
			||||||
 | 
					    T alpha = 0;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int i=k; i<j+1; i++){
 | 
				
			||||||
 | 
					      v[i] = input[i]/m;
 | 
				
			||||||
 | 
					      alpha = alpha + v[i]*conj(v[i]);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    alpha = sqrt(alpha);
 | 
				
			||||||
 | 
					    beta = 1.0/(alpha*(alpha + abs(v[dir]) ));
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
					    if(abs(v[dir]) > 0.0) v[dir] = v[dir] + (v[dir]/abs(v[dir]))*alpha;
 | 
				
			||||||
 | 
					    else                  v[dir] = -alpha;
 | 
				
			||||||
 | 
					  }else{
 | 
				
			||||||
 | 
					    for(int i=k; i<j+1; i++){
 | 
				
			||||||
 | 
					      v[i] = 0.0;
 | 
				
			||||||
 | 
					    } 
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
						Compute the product PA if trans = 0
 | 
				
			||||||
 | 
						AP if trans = 1
 | 
				
			||||||
 | 
						P = (I - b v transpose(v) )
 | 
				
			||||||
 | 
						b = 2/v.v
 | 
				
			||||||
 | 
						start at element l of matrix A
 | 
				
			||||||
 | 
						v is of length j - k + 1 of v are nonzero
 | 
				
			||||||
 | 
					 **/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template <class T> inline void Householder_mult(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int k, int j, int trans)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int N ; SizeSquare(A,N);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if(abs(beta) > 0.0){
 | 
				
			||||||
 | 
					    for(int p=l; p<N; p++){
 | 
				
			||||||
 | 
					      T s = 0;
 | 
				
			||||||
 | 
					      if(trans==0){
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++) s += conj(v[i-k])*A[i][p];
 | 
				
			||||||
 | 
						s *= beta;
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++){ A[i][p] = A[i][p]-s*conj(v[i-k]);}
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++){ s += conj(v[i-k])*A[p][i];}
 | 
				
			||||||
 | 
						s *= beta;
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++){ A[p][i]=A[p][i]-s*conj(v[i-k]);}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
						Compute the product PA if trans = 0
 | 
				
			||||||
 | 
						AP if trans = 1
 | 
				
			||||||
 | 
						P = (I - b v transpose(v) )
 | 
				
			||||||
 | 
						b = 2/v.v
 | 
				
			||||||
 | 
						start at element l of matrix A
 | 
				
			||||||
 | 
						v is of length j - k + 1 of v are nonzero
 | 
				
			||||||
 | 
						A is tridiagonal
 | 
				
			||||||
 | 
					 **/
 | 
				
			||||||
 | 
					template <class T> inline void Householder_mult_tri(DenseMatrix<T> &A , DenseVector<T> v, T beta, int l, int M, int k, int j, int trans)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  if(abs(beta) > 0.0){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    int N ; SizeSquare(A,N);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    DenseMatrix<T> tmp; Resize(tmp,N,N); Fill(tmp,0); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    T s;
 | 
				
			||||||
 | 
					    for(int p=l; p<M; p++){
 | 
				
			||||||
 | 
					      s = 0;
 | 
				
			||||||
 | 
					      if(trans==0){
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++) s = s + conj(v[i-k])*A[i][p];
 | 
				
			||||||
 | 
					      }else{
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++) s = s + v[i-k]*A[p][i];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      s = beta*s;
 | 
				
			||||||
 | 
					      if(trans==0){
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++) tmp[i][p] = tmp(i,p) - s*v[i-k];
 | 
				
			||||||
 | 
					      }else{
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++) tmp[p][i] = tmp[p][i] - s*conj(v[i-k]);
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    for(int p=l; p<M; p++){
 | 
				
			||||||
 | 
					      if(trans==0){
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++) A[i][p] = A[i][p] + tmp[i][p];
 | 
				
			||||||
 | 
					      }else{
 | 
				
			||||||
 | 
						for(int i=k;i<j+1;i++) A[p][i] = A[p][i] + tmp[p][i];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										426
									
								
								lib/algorithms/iterative/Matrix.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										426
									
								
								lib/algorithms/iterative/Matrix.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,426 @@
 | 
				
			|||||||
 | 
					#ifndef MATRIX_H
 | 
				
			||||||
 | 
					#define MATRIX_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <cstdlib>
 | 
				
			||||||
 | 
					#include <string>
 | 
				
			||||||
 | 
					#include <cmath>
 | 
				
			||||||
 | 
					#include <vector>
 | 
				
			||||||
 | 
					#include <iostream>
 | 
				
			||||||
 | 
					#include <iomanip>
 | 
				
			||||||
 | 
					#include <complex>
 | 
				
			||||||
 | 
					#include <typeinfo>
 | 
				
			||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Sign function **/
 | 
				
			||||||
 | 
					template <class T> T sign(T p){return ( p/abs(p) );}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					///////////////////// Hijack STL containers for our wicked means /////////////////////////////////////////
 | 
				
			||||||
 | 
					/////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
				
			||||||
 | 
					template<class T> using Vector = Vector<T>;
 | 
				
			||||||
 | 
					template<class T> using Matrix = Vector<Vector<T> >;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class T> void Resize(Vector<T > & vec, int N) { vec.resize(N); }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class T> void Resize(Matrix<T > & mat, int N, int M) { 
 | 
				
			||||||
 | 
					  mat.resize(N);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    mat[i].resize(M);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					template<class T> void Size(Vector<T> & vec, int &N) 
 | 
				
			||||||
 | 
					{ 
 | 
				
			||||||
 | 
					  N= vec.size();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					template<class T> void Size(Matrix<T> & mat, int &N,int &M) 
 | 
				
			||||||
 | 
					{ 
 | 
				
			||||||
 | 
					  N= mat.size();
 | 
				
			||||||
 | 
					  M= mat[0].size();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					template<class T> void SizeSquare(Matrix<T> & mat, int &N) 
 | 
				
			||||||
 | 
					{ 
 | 
				
			||||||
 | 
					  int M; Size(mat,N,M);
 | 
				
			||||||
 | 
					  assert(N==M);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					template<class T> void SizeSame(Matrix<T> & mat1,Matrix<T> &mat2, int &N1,int &M1) 
 | 
				
			||||||
 | 
					{ 
 | 
				
			||||||
 | 
					  int N2,M2;
 | 
				
			||||||
 | 
					  Size(mat1,N1,M1);
 | 
				
			||||||
 | 
					  Size(mat2,N2,M2);
 | 
				
			||||||
 | 
					  assert(N1==N2);
 | 
				
			||||||
 | 
					  assert(M1==M2);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					//*****************************************
 | 
				
			||||||
 | 
					//*	(Complex) Vector operations	*
 | 
				
			||||||
 | 
					//*****************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**Conj of a Vector **/
 | 
				
			||||||
 | 
					template <class T> Vector<T> conj(Vector<T> p){
 | 
				
			||||||
 | 
						Vector<T> q(p.size());
 | 
				
			||||||
 | 
						for(int i=0;i<p.size();i++){q[i] = conj(p[i]);}
 | 
				
			||||||
 | 
						return q;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Norm of a Vector**/
 | 
				
			||||||
 | 
					template <class T> T norm(Vector<T> p){
 | 
				
			||||||
 | 
						T sum = 0;
 | 
				
			||||||
 | 
						for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);}
 | 
				
			||||||
 | 
						return abs(sqrt(sum));
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Norm squared of a Vector **/
 | 
				
			||||||
 | 
					template <class T> T norm2(Vector<T> p){
 | 
				
			||||||
 | 
						T sum = 0;
 | 
				
			||||||
 | 
						for(int i=0;i<p.size();i++){sum = sum + p[i]*conj(p[i]);}
 | 
				
			||||||
 | 
						return abs((sum));
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Sum elements of a Vector **/
 | 
				
			||||||
 | 
					template <class T> T trace(Vector<T> p){
 | 
				
			||||||
 | 
						T sum = 0;
 | 
				
			||||||
 | 
						for(int i=0;i<p.size();i++){sum = sum + p[i];}
 | 
				
			||||||
 | 
						return sum;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Fill a Vector with constant c **/
 | 
				
			||||||
 | 
					template <class T> void Fill(Vector<T> &p, T c){
 | 
				
			||||||
 | 
						for(int i=0;i<p.size();i++){p[i] = c;}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** Normalize a Vector **/
 | 
				
			||||||
 | 
					template <class T> void normalize(Vector<T> &p){
 | 
				
			||||||
 | 
						T m = norm(p);
 | 
				
			||||||
 | 
						if( abs(m) > 0.0) for(int i=0;i<p.size();i++){p[i] /= m;}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** Vector by scalar **/
 | 
				
			||||||
 | 
					template <class T, class U> Vector<T> times(Vector<T> p, U s){
 | 
				
			||||||
 | 
						for(int i=0;i<p.size();i++){p[i] *= s;}
 | 
				
			||||||
 | 
						return p;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					template <class T, class U> Vector<T> times(U s, Vector<T> p){
 | 
				
			||||||
 | 
						for(int i=0;i<p.size();i++){p[i] *= s;}
 | 
				
			||||||
 | 
						return p;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** inner product of a and b = conj(a) . b **/
 | 
				
			||||||
 | 
					template <class T> T inner(Vector<T> a, Vector<T> b){
 | 
				
			||||||
 | 
						T m = 0.;
 | 
				
			||||||
 | 
						for(int i=0;i<a.size();i++){m = m + conj(a[i])*b[i];}
 | 
				
			||||||
 | 
						return m;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** sum of a and b = a + b **/
 | 
				
			||||||
 | 
					template <class T> Vector<T> add(Vector<T> a, Vector<T> b){
 | 
				
			||||||
 | 
						Vector<T> m(a.size());
 | 
				
			||||||
 | 
						for(int i=0;i<a.size();i++){m[i] = a[i] + b[i];}
 | 
				
			||||||
 | 
						return m;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** sum of a and b = a - b **/
 | 
				
			||||||
 | 
					template <class T> Vector<T> sub(Vector<T> a, Vector<T> b){
 | 
				
			||||||
 | 
						Vector<T> m(a.size());
 | 
				
			||||||
 | 
						for(int i=0;i<a.size();i++){m[i] = a[i] - b[i];}
 | 
				
			||||||
 | 
						return m;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** 
 | 
				
			||||||
 | 
					 *********************************
 | 
				
			||||||
 | 
					 *	Matrices	         *
 | 
				
			||||||
 | 
					 *********************************
 | 
				
			||||||
 | 
					 **/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class T> void Fill(Matrix<T> & mat, T&val) { 
 | 
				
			||||||
 | 
					  int N,M;
 | 
				
			||||||
 | 
					  Size(mat,N,M);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					  for(int j=0;j<M;j++){
 | 
				
			||||||
 | 
					    mat[i][j] = val;
 | 
				
			||||||
 | 
					  }}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Transpose of a matrix **/
 | 
				
			||||||
 | 
					Matrix<T> Transpose(Matrix<T> & mat){
 | 
				
			||||||
 | 
					  int N,M;
 | 
				
			||||||
 | 
					  Size(mat,N,M);
 | 
				
			||||||
 | 
					  Matrix C; Resize(C,M,N);
 | 
				
			||||||
 | 
					  for(int i=0;i<M;i++){
 | 
				
			||||||
 | 
					  for(int j=0;j<N;j++){
 | 
				
			||||||
 | 
					    C[i][j] = mat[j][i];
 | 
				
			||||||
 | 
					  }} 
 | 
				
			||||||
 | 
					  return C;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** Set Matrix to unit matrix **/
 | 
				
			||||||
 | 
					template<class T> void Unity(Matrix<T> &mat){
 | 
				
			||||||
 | 
					  int N;  SizeSquare(mat,N);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    for(int j=0;j<N;j++){
 | 
				
			||||||
 | 
					      if ( i==j ) A[i][j] = 1;
 | 
				
			||||||
 | 
					      else        A[i][j] = 0;
 | 
				
			||||||
 | 
					    } 
 | 
				
			||||||
 | 
					  } 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** Add C * I to matrix **/
 | 
				
			||||||
 | 
					template<class T>
 | 
				
			||||||
 | 
					void PlusUnit(Matrix<T> & A,T c){
 | 
				
			||||||
 | 
					  int dim;  SizeSquare(A,dim);
 | 
				
			||||||
 | 
					  for(int i=0;i<dim;i++){A[i][i] = A[i][i] + c;} 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** return the Hermitian conjugate of matrix **/
 | 
				
			||||||
 | 
					Matrix<T> HermitianConj(Matrix<T> &mat){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int dim; SizeSquare(mat,dim);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Matrix<T> C; Resize(C,dim,dim);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i<dim;i++){
 | 
				
			||||||
 | 
					    for(int j=0;j<dim;j++){
 | 
				
			||||||
 | 
					      C[i][j] = conj(mat[j][i]);
 | 
				
			||||||
 | 
					    } 
 | 
				
			||||||
 | 
					  } 
 | 
				
			||||||
 | 
					  return C;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** return diagonal entries as a Vector **/
 | 
				
			||||||
 | 
					Vector<T> diag(Matrix<T> &A)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int dim; SizeSquare(A,dim);
 | 
				
			||||||
 | 
					  Vector<T> d; Resize(d,dim);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i<dim;i++){
 | 
				
			||||||
 | 
					    d[i] = A[i][i];
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return d;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Left multiply by a Vector **/
 | 
				
			||||||
 | 
					Vector<T> operator *(Vector<T> &B,Matrix<T> &A)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int K,M,N; 
 | 
				
			||||||
 | 
					  Size(B,K);
 | 
				
			||||||
 | 
					  Size(A,M,N);
 | 
				
			||||||
 | 
					  assert(K==M);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  Vector<T> C; Resize(C,N);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int j=0;j<N;j++){
 | 
				
			||||||
 | 
					    T sum = 0.0;
 | 
				
			||||||
 | 
					    for(int i=0;i<M;i++){
 | 
				
			||||||
 | 
					      sum += B[i] * A[i][j];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    C[j] =  sum;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return C; 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** return 1/diagonal entries as a Vector **/
 | 
				
			||||||
 | 
					Vector<T> inv_diag(Matrix<T> & A){
 | 
				
			||||||
 | 
					  int dim; SizeSquare(A,dim);
 | 
				
			||||||
 | 
					  Vector<T> d; Resize(d,dim);
 | 
				
			||||||
 | 
					  for(int i=0;i<dim;i++){
 | 
				
			||||||
 | 
					    d[i] = 1.0/A[i][i];
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return d;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/** Matrix Addition **/
 | 
				
			||||||
 | 
					inline Matrix<T> operator + (Matrix<T> &A,Matrix<T> &B)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int N,M  ; SizeSame(A,B,N,M);
 | 
				
			||||||
 | 
					  Matrix C; Resize(C,N,M);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    for(int j=0;j<M;j++){
 | 
				
			||||||
 | 
					      C[i][j] = A[i][j] +  B[i][j];
 | 
				
			||||||
 | 
					    } 
 | 
				
			||||||
 | 
					  } 
 | 
				
			||||||
 | 
					  return C;
 | 
				
			||||||
 | 
					} 
 | 
				
			||||||
 | 
					/** Matrix Subtraction **/
 | 
				
			||||||
 | 
					inline Matrix<T> operator- (Matrix<T> & A,Matrix<T> &B){
 | 
				
			||||||
 | 
					  int N,M  ; SizeSame(A,B,N,M);
 | 
				
			||||||
 | 
					  Matrix C; Resize(C,N,M);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					  for(int j=0;j<M;j++){
 | 
				
			||||||
 | 
					    C[i][j] = A[i][j] -  B[i][j];
 | 
				
			||||||
 | 
					  }}
 | 
				
			||||||
 | 
					  return C;
 | 
				
			||||||
 | 
					} 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Matrix scalar multiplication **/
 | 
				
			||||||
 | 
					inline Matrix<T> operator* (Matrix<T> & A,T c){
 | 
				
			||||||
 | 
					  int N,M; Size(A,N,M);
 | 
				
			||||||
 | 
					  Matrix C; Resize(C,N,M);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					  for(int j=0;j<M;j++){
 | 
				
			||||||
 | 
					    C[i][j] = A[i][j]*c;
 | 
				
			||||||
 | 
					  }} 
 | 
				
			||||||
 | 
					  return C;
 | 
				
			||||||
 | 
					} 
 | 
				
			||||||
 | 
					/** Matrix Matrix multiplication **/
 | 
				
			||||||
 | 
					inline Matrix<T> operator* (Matrix<T> &A,Matrix<T> &B){
 | 
				
			||||||
 | 
					  int K,L,N,M;
 | 
				
			||||||
 | 
					  Size(A,K,L);
 | 
				
			||||||
 | 
					  Size(B,N,M); assert(L==N);
 | 
				
			||||||
 | 
					  Matrix C; Resize(C,K,M);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i<K;i++){
 | 
				
			||||||
 | 
					    for(int j=0;j<M;j++){
 | 
				
			||||||
 | 
					      T sum = 0.0;
 | 
				
			||||||
 | 
					      for(int k=0;k<N;k++) sum += A[i][k]*B[k][j];
 | 
				
			||||||
 | 
					      C[i][j] =sum;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return C; 
 | 
				
			||||||
 | 
					} 
 | 
				
			||||||
 | 
					/** Matrix Vector multiplication **/
 | 
				
			||||||
 | 
					inline Vector<T> operator* (Matrix<T> &A,Vector<T> &B){
 | 
				
			||||||
 | 
					  int M,N,K;
 | 
				
			||||||
 | 
					  Size(A,N,M);
 | 
				
			||||||
 | 
					  Size(B,K); assert(K==M);
 | 
				
			||||||
 | 
					  Vector<T> C; Resize(C,N);
 | 
				
			||||||
 | 
					  for(int i=0;i<N;i++){
 | 
				
			||||||
 | 
					    T sum = 0.0;
 | 
				
			||||||
 | 
					    for(int j=0;j<M;j++) sum += A[i][j]*B[j];
 | 
				
			||||||
 | 
					    C[i] =  sum;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return C; 
 | 
				
			||||||
 | 
					} 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Some version of Matrix norm **/
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					inline T Norm(){ // this is not a usual L2 norm
 | 
				
			||||||
 | 
					    T norm = 0;
 | 
				
			||||||
 | 
					    for(int i=0;i<dim;i++){
 | 
				
			||||||
 | 
					      for(int j=0;j<dim;j++){
 | 
				
			||||||
 | 
						norm += abs(A[i][j]);
 | 
				
			||||||
 | 
					    }}
 | 
				
			||||||
 | 
					    return norm;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					*/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Some version of Matrix norm **/
 | 
				
			||||||
 | 
					template<class T> T LargestDiag(Matrix<T> &A)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int dim ; SizeSquare(A,dim); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  T ld = abs(A[0][0]);
 | 
				
			||||||
 | 
					  for(int i=1;i<dim;i++){
 | 
				
			||||||
 | 
					    T cf = abs(A[i][i]);
 | 
				
			||||||
 | 
					    if(abs(cf) > abs(ld) ){ld = cf;}
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return ld;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Look for entries on the leading subdiagonal that are smaller than 'small' **/
 | 
				
			||||||
 | 
					template <class T,class U> int Chop_subdiag(Matrix<T> &A,T norm, int offset, U small)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int dim; SizeSquare(A,dim);
 | 
				
			||||||
 | 
					  for(int l = dim - 1 - offset; l >= 1; l--) {             		
 | 
				
			||||||
 | 
					    if((U)abs(A[l][l - 1]) < (U)small) {
 | 
				
			||||||
 | 
					      A[l][l-1]=(U)0.0;
 | 
				
			||||||
 | 
					      return l;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/** Look for entries on the leading subdiagonal that are smaller than 'small' **/
 | 
				
			||||||
 | 
					template <class T,class U> int Chop_symm_subdiag(Matrix<T> & A,T norm, int offset, U small) 
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int dim; SizeSquare(A,dim);
 | 
				
			||||||
 | 
					  for(int l = dim - 1 - offset; l >= 1; l--) {
 | 
				
			||||||
 | 
					    if((U)abs(A[l][l - 1]) < (U)small) {
 | 
				
			||||||
 | 
					      A[l][l - 1] = (U)0.0;
 | 
				
			||||||
 | 
					      A[l - 1][l] = (U)0.0;
 | 
				
			||||||
 | 
					      return l;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return 0;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					/**Assign a submatrix to a larger one**/
 | 
				
			||||||
 | 
					template<class T>
 | 
				
			||||||
 | 
					void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  for(int i = row_st; i<row_end; i++){
 | 
				
			||||||
 | 
					    for(int j = col_st; j<col_end; j++){
 | 
				
			||||||
 | 
					      A[i][j] = S[i - row_st][j - col_st];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**Get a square submatrix**/
 | 
				
			||||||
 | 
					template <class T>
 | 
				
			||||||
 | 
					Matrix<T> GetSubMtx(Matrix<T> &A,int row_st, int row_end, int col_st, int col_end)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Matrix<T> H; Resize(row_end - row_st,col_end-col_st);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i = row_st; i<row_end; i++){
 | 
				
			||||||
 | 
					  for(int j = col_st; j<col_end; j++){
 | 
				
			||||||
 | 
					    H[i-row_st][j-col_st]=A[i][j];
 | 
				
			||||||
 | 
					  }}
 | 
				
			||||||
 | 
					  return H;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					 /**Assign a submatrix to a larger one NB remember Vector Vectors are transposes of the matricies they represent**/
 | 
				
			||||||
 | 
					template<class T>
 | 
				
			||||||
 | 
					void AssignSubMtx(Matrix<T> & A,int row_st, int row_end, int col_st, int col_end, Matrix<T> &S)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  for(int i = row_st; i<row_end; i++){
 | 
				
			||||||
 | 
					  for(int j = col_st; j<col_end; j++){
 | 
				
			||||||
 | 
					    A[i][j] = S[i - row_st][j - col_st];
 | 
				
			||||||
 | 
					  }}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					/** compute b_i A_ij b_j **/ // surprised no Conj
 | 
				
			||||||
 | 
					template<class T> T proj(Matrix<T> A, Vector<T> B){
 | 
				
			||||||
 | 
					  int dim; SizeSquare(A,dim);
 | 
				
			||||||
 | 
					  int dimB; Size(B,dimB);
 | 
				
			||||||
 | 
					  assert(dimB==dim);
 | 
				
			||||||
 | 
					  T C = 0;
 | 
				
			||||||
 | 
					  for(int i=0;i<dim;i++){
 | 
				
			||||||
 | 
					    T sum = 0.0;
 | 
				
			||||||
 | 
					    for(int j=0;j<dim;j++){
 | 
				
			||||||
 | 
					      sum += A[i][j]*B[j];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    C +=  B[i]*sum; // No conj?
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  return C; 
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 *************************************************************
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Matrix Vector products
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 *************************************************************
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					// Instead make a linop and call my CG;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// q -> q Q
 | 
				
			||||||
 | 
					template <class T,class Fermion> void times(Vector<Fermion> &q, Matrix<T> &Q)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int M; SizeSquare(Q,M);
 | 
				
			||||||
 | 
					  int N; Size(q,N); 
 | 
				
			||||||
 | 
					  assert(M==N);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  times(q,Q,N);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/// q -> q Q
 | 
				
			||||||
 | 
					template <class T> void times(multi1d<LatticeFermion> &q, Matrix<T> &Q, int N)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  GridBase *grid = q[0]._grid;
 | 
				
			||||||
 | 
					  int M; SizeSquare(Q,M);
 | 
				
			||||||
 | 
					  int K; Size(q,K); 
 | 
				
			||||||
 | 
					  assert(N<M);
 | 
				
			||||||
 | 
					  assert(N<K);
 | 
				
			||||||
 | 
					  Vector<Fermion> S(N,grid );
 | 
				
			||||||
 | 
					  for(int j=0;j<N;j++){
 | 
				
			||||||
 | 
					    S[j] = zero;
 | 
				
			||||||
 | 
					    for(int k=0;k<N;k++){
 | 
				
			||||||
 | 
					      S[j] = S[j] +  q[k]* Q[k][j]; 
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  for(int j=0;j<q.size();j++){
 | 
				
			||||||
 | 
					    q[j] = S[j];
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
							
								
								
									
										122
									
								
								lib/algorithms/iterative/bisec.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										122
									
								
								lib/algorithms/iterative/bisec.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,122 @@
 | 
				
			|||||||
 | 
					#include <math.h>
 | 
				
			||||||
 | 
					#include <stdlib.h>
 | 
				
			||||||
 | 
					#include <vector>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					struct Bisection {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void get_eig2(int row_num,std::vector<RealD> &ALPHA,std::vector<RealD> &BETA, std::vector<RealD> & eig)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int i,j;
 | 
				
			||||||
 | 
					  std::vector<RealD> evec1(row_num+3);
 | 
				
			||||||
 | 
					  std::vector<RealD> evec2(row_num+3);
 | 
				
			||||||
 | 
					  RealD eps2;
 | 
				
			||||||
 | 
					  ALPHA[1]=0.;
 | 
				
			||||||
 | 
					  BETHA[1]=0.;
 | 
				
			||||||
 | 
					  for(i=0;i<row_num-1;i++) {
 | 
				
			||||||
 | 
					    ALPHA[i+1] = A[i*(row_num+1)].real();
 | 
				
			||||||
 | 
					    BETHA[i+2] = A[i*(row_num+1)+1].real();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  ALPHA[row_num] = A[(row_num-1)*(row_num+1)].real();
 | 
				
			||||||
 | 
					  bisec(ALPHA,BETHA,row_num,1,row_num,1e-10,1e-10,evec1,eps2);
 | 
				
			||||||
 | 
					  bisec(ALPHA,BETHA,row_num,1,row_num,1e-16,1e-16,evec2,eps2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Do we really need to sort here?
 | 
				
			||||||
 | 
					  int begin=1;
 | 
				
			||||||
 | 
					  int end = row_num;
 | 
				
			||||||
 | 
					  int swapped=1;
 | 
				
			||||||
 | 
					  while(swapped) {
 | 
				
			||||||
 | 
					    swapped=0;
 | 
				
			||||||
 | 
					    for(i=begin;i<end;i++){
 | 
				
			||||||
 | 
					      if(mag(evec2[i])>mag(evec2[i+1]))	{
 | 
				
			||||||
 | 
						swap(evec2+i,evec2+i+1);
 | 
				
			||||||
 | 
						swapped=1;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    end--;
 | 
				
			||||||
 | 
					    for(i=end-1;i>=begin;i--){
 | 
				
			||||||
 | 
					      if(mag(evec2[i])>mag(evec2[i+1]))	{
 | 
				
			||||||
 | 
						swap(evec2+i,evec2+i+1);
 | 
				
			||||||
 | 
						swapped=1;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    begin++;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(i=0;i<row_num;i++){
 | 
				
			||||||
 | 
					    for(j=0;j<row_num;j++) {
 | 
				
			||||||
 | 
					      if(i==j) H[i*row_num+j]=evec2[i+1];
 | 
				
			||||||
 | 
					      else H[i*row_num+j]=0.;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static void bisec(std::vector<RealD> &c,   
 | 
				
			||||||
 | 
							  std::vector<RealD> &b,
 | 
				
			||||||
 | 
							  int n,
 | 
				
			||||||
 | 
							  int m1,
 | 
				
			||||||
 | 
							  int m2,
 | 
				
			||||||
 | 
							  RealD eps1,
 | 
				
			||||||
 | 
							  RealD relfeh,
 | 
				
			||||||
 | 
							  std::vector<RealD> &x,
 | 
				
			||||||
 | 
							  RealD &eps2)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  std::vector<RealD> wu(n+2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD h,q,x1,xu,x0,xmin,xmax; 
 | 
				
			||||||
 | 
					  int i,a,k;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  b[1]=0.0;
 | 
				
			||||||
 | 
					  xmin=c[n]-fabs(b[n]);
 | 
				
			||||||
 | 
					  xmax=c[n]+fabs(b[n]);
 | 
				
			||||||
 | 
					  for(i=1;i<n;i++){
 | 
				
			||||||
 | 
					    h=fabs(b[i])+fabs(b[i+1]);
 | 
				
			||||||
 | 
					    if(c[i]+h>xmax) xmax= c[i]+h;
 | 
				
			||||||
 | 
					    if(c[i]-h<xmin) xmin= c[i]-h;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  xmax *=2.;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  eps2=relfeh*((xmin+xmax)>0.0 ? xmax : -xmin);
 | 
				
			||||||
 | 
					  if(eps1<=0.0) eps1=eps2;
 | 
				
			||||||
 | 
					  eps2=0.5*eps1+7.0*(eps2);
 | 
				
			||||||
 | 
					  x0=xmax;
 | 
				
			||||||
 | 
					  for(i=m1;i<=m2;i++){
 | 
				
			||||||
 | 
					    x[i]=xmax;
 | 
				
			||||||
 | 
					    wu[i]=xmin;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(k=m2;k>=m1;k--){
 | 
				
			||||||
 | 
					    xu=xmin;
 | 
				
			||||||
 | 
					    i=k;
 | 
				
			||||||
 | 
					    do{
 | 
				
			||||||
 | 
					      if(xu<wu[i]){
 | 
				
			||||||
 | 
						xu=wu[i];
 | 
				
			||||||
 | 
						i=m1-1;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      i--;
 | 
				
			||||||
 | 
					    }while(i>=m1);
 | 
				
			||||||
 | 
					    if(x0>x[k]) x0=x[k];
 | 
				
			||||||
 | 
					    while((x0-xu)>2*relfeh*(fabs(xu)+fabs(x0))+eps1){
 | 
				
			||||||
 | 
					      x1=(xu+x0)/2;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      a=0;
 | 
				
			||||||
 | 
					      q=1.0;
 | 
				
			||||||
 | 
					      for(i=1;i<=n;i++){
 | 
				
			||||||
 | 
						q=c[i]-x1-((q!=0.0)? b[i]*b[i]/q:fabs(b[i])/relfeh);
 | 
				
			||||||
 | 
						if(q<0) a++;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      //			printf("x1=%e a=%d\n",x1,a);
 | 
				
			||||||
 | 
					      if(a<k){
 | 
				
			||||||
 | 
						if(a<m1){
 | 
				
			||||||
 | 
						  xu=x1;
 | 
				
			||||||
 | 
						  wu[m1]=x1;
 | 
				
			||||||
 | 
						}else {
 | 
				
			||||||
 | 
						  xu=x1;
 | 
				
			||||||
 | 
						  wu[a+1]=x1;
 | 
				
			||||||
 | 
						  if(x[a]>x1) x[a]=x1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }else x0=x1;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    x[k]=(x0+xu)/2;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										1
									
								
								lib/algorithms/iterative/get_eig.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								lib/algorithms/iterative/get_eig.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
@@ -29,16 +29,27 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
 | 
				
			|||||||
  
 | 
					  
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
					  int e1=rhs._grid->_slice_nblock[dimension];
 | 
				
			||||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
					  int e2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( cbmask == 0x3 ) { 
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
  for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
    for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
      int o  = n*rhs._grid->_slice_stride[dimension];
 | 
						int o  = n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
      int bo = n*rhs._grid->_slice_block[dimension];
 | 
						int bo = n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
					 | 
				
			||||||
      if ( ocb &cbmask ) {
 | 
					 | 
				
			||||||
	buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
						buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					     int bo=0;
 | 
				
			||||||
 | 
					     for(int n=0;n<e1;n++){
 | 
				
			||||||
 | 
					       for(int b=0;b<e2;b++){
 | 
				
			||||||
 | 
						 int o  = n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
						 int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
				
			||||||
 | 
						 if ( ocb &cbmask ) {
 | 
				
			||||||
 | 
						   buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
				
			||||||
 | 
						 }
 | 
				
			||||||
 | 
					       }
 | 
				
			||||||
 | 
					     }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -59,18 +70,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
					  int e1=rhs._grid->_slice_nblock[dimension];
 | 
				
			||||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
					  int e2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( cbmask ==0x3){
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
  for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
    for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int o=n*rhs._grid->_slice_stride[dimension];
 | 
						int o=n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
      int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
						int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
						cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
				
			||||||
      if ( ocb & cbmask ) {
 | 
					 | 
				
			||||||
	cobj temp; 
 | 
					 | 
				
			||||||
	temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
					 | 
				
			||||||
	extract<cobj>(temp,pointers,offset);
 | 
						extract<cobj>(temp,pointers,offset);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert(0); //Fixme think this is buggy
 | 
				
			||||||
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 | 
						int o=n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
						int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
				
			||||||
 | 
						int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if ( ocb & cbmask ) {
 | 
				
			||||||
 | 
						  cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
 | 
				
			||||||
 | 
						  extract<cobj>(temp,pointers,offset);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -109,16 +135,28 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<v
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
					  int e1=rhs._grid->_slice_nblock[dimension];
 | 
				
			||||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
					  int e2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( cbmask ==0x3 ) {
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
  for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
    for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
      int o   =n*rhs._grid->_slice_stride[dimension];
 | 
						int o   =n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
      int bo  =n*rhs._grid->_slice_block[dimension];
 | 
						int bo  =n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
					 | 
				
			||||||
      if ( ocb & cbmask ) {
 | 
					 | 
				
			||||||
	rhs._odata[so+o+b]=buffer[bo+b];
 | 
						rhs._odata[so+o+b]=buffer[bo+b];
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    int bo=0;
 | 
				
			||||||
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 | 
						int o   =n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
						int bo  =n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
						int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
				
			||||||
 | 
						if ( ocb & cbmask ) {
 | 
				
			||||||
 | 
						  rhs._odata[so+o+b]=buffer[bo++];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -137,16 +175,28 @@ PARALLEL_NESTED_LOOP2
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension];
 | 
					  int e1=rhs._grid->_slice_nblock[dimension];
 | 
				
			||||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
					  int e2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if(cbmask ==0x3 ) {
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
  for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
    for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
      int o      = n*rhs._grid->_slice_stride[dimension];
 | 
						int o      = n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
      int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
						int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
      int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
					 | 
				
			||||||
      if ( ocb&cbmask ) {
 | 
					 | 
				
			||||||
	merge(rhs._odata[so+o+b],pointers,offset);
 | 
						merge(rhs._odata[so+o+b],pointers,offset);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    assert(0); // think this is buggy FIXME
 | 
				
			||||||
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 | 
						int o      = n*rhs._grid->_slice_stride[dimension];
 | 
				
			||||||
 | 
						int offset = b+n*rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
						int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
 | 
				
			||||||
 | 
						if ( ocb&cbmask ) {
 | 
				
			||||||
 | 
						  merge(rhs._odata[so+o+b],pointers,offset);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -166,17 +216,29 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
 | 
					  int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
 | 
				
			||||||
  int e2=rhs._grid->_slice_block[dimension];
 | 
					  int e2=rhs._grid->_slice_block[dimension];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if(cbmask == 0x3 ){
 | 
				
			||||||
PARALLEL_NESTED_LOOP2
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
  for(int n=0;n<e1;n++){
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
    for(int b=0;b<e2;b++){
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 
 | 
					 
 | 
				
			||||||
      int o =n*rhs._grid->_slice_stride[dimension]+b;
 | 
					        int o =n*rhs._grid->_slice_stride[dimension]+b;
 | 
				
			||||||
      int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
 | 
					  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
				
			||||||
      if ( ocb&cbmask ) {
 | 
					 | 
				
			||||||
	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
					 | 
				
			||||||
	vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 | 
						vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					PARALLEL_NESTED_LOOP2
 | 
				
			||||||
 | 
					    for(int n=0;n<e1;n++){
 | 
				
			||||||
 | 
					      for(int b=0;b<e2;b++){
 | 
				
			||||||
 | 
					 
 | 
				
			||||||
 | 
					        int o =n*rhs._grid->_slice_stride[dimension]+b;
 | 
				
			||||||
 | 
					        int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
 | 
				
			||||||
 | 
					        if ( ocb&cbmask ) {
 | 
				
			||||||
 | 
					  	//lhs._odata[lo+o]=rhs._odata[ro+o];
 | 
				
			||||||
 | 
						  vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -9,7 +9,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
 | 
				
			|||||||
  typedef typename vobj::vector_type vector_type;
 | 
					  typedef typename vobj::vector_type vector_type;
 | 
				
			||||||
  typedef typename vobj::scalar_type scalar_type;
 | 
					  typedef typename vobj::scalar_type scalar_type;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Lattice<vobj> ret(rhs._grid);
 | 
					  Lattice<vobj> ret(rhs._grid); 
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  int fd = rhs._grid->_fdimensions[dimension];
 | 
					  int fd = rhs._grid->_fdimensions[dimension];
 | 
				
			||||||
  int rd = rhs._grid->_rdimensions[dimension];
 | 
					  int rd = rhs._grid->_rdimensions[dimension];
 | 
				
			||||||
@@ -26,10 +26,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( !comm_dim ) {
 | 
					  if ( !comm_dim ) {
 | 
				
			||||||
 | 
					    //    std::cout << "Cshift_local" <<std::endl;
 | 
				
			||||||
    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
 | 
					    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
 | 
				
			||||||
  } else if ( splice_dim ) {
 | 
					  } else if ( splice_dim ) {
 | 
				
			||||||
 | 
					    //    std::cout << "Cshift_comms_simd" <<std::endl;
 | 
				
			||||||
    Cshift_comms_simd(ret,rhs,dimension,shift);
 | 
					    Cshift_comms_simd(ret,rhs,dimension,shift);
 | 
				
			||||||
  } else {
 | 
					  } else {
 | 
				
			||||||
 | 
					    //    std::cout << "Cshift_comms" <<std::endl;
 | 
				
			||||||
    Cshift_comms(ret,rhs,dimension,shift);
 | 
					    Cshift_comms(ret,rhs,dimension,shift);
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
  return ret;
 | 
					  return ret;
 | 
				
			||||||
@@ -42,9 +45,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
 | 
				
			|||||||
  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
 | 
					  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
 | 
				
			||||||
  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
 | 
					  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  if ( sshift[0] == sshift[1] ) {
 | 
					  if ( sshift[0] == sshift[1] ) {
 | 
				
			||||||
 | 
					    //    std::cout << "Single pass Cshift_comms" <<std::endl;
 | 
				
			||||||
    Cshift_comms(ret,rhs,dimension,shift,0x3);
 | 
					    Cshift_comms(ret,rhs,dimension,shift,0x3);
 | 
				
			||||||
  } else {
 | 
					  } else {
 | 
				
			||||||
 | 
					    //    std::cout << "Two pass Cshift_comms" <<std::endl;
 | 
				
			||||||
    Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
					    Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
				
			||||||
    Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
 | 
					    Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
@@ -113,12 +120,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 | 
				
			|||||||
      int xmit_to_rank;
 | 
					      int xmit_to_rank;
 | 
				
			||||||
      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
 | 
					      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      grid->SendToRecvFrom((void *)&send_buf[0],
 | 
					      grid->SendToRecvFrom((void *)&send_buf[0],
 | 
				
			||||||
			   xmit_to_rank,
 | 
								   xmit_to_rank,
 | 
				
			||||||
			   (void *)&recv_buf[0],
 | 
								   (void *)&recv_buf[0],
 | 
				
			||||||
			   recv_from_rank,
 | 
								   recv_from_rank,
 | 
				
			||||||
			   bytes);
 | 
								   bytes);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      //      for(int i=0;i<words;i++){
 | 
				
			||||||
 | 
					      //	std::cout << "SendRecv ["<<i<<"] snd "<<send_buf[i]<<" rcv " << recv_buf[i] << "  0x" << cbmask<<std::endl;
 | 
				
			||||||
 | 
					      //      }
 | 
				
			||||||
      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
 | 
					      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -212,9 +212,10 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    // Constructor requires "grid" passed.
 | 
					    // Constructor requires "grid" passed.
 | 
				
			||||||
    // what about a default grid?
 | 
					    // what about a default grid?
 | 
				
			||||||
    //////////////////////////////////////////////////////////////////
 | 
					    //////////////////////////////////////////////////////////////////
 | 
				
			||||||
 Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
 | 
					    Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
 | 
				
			||||||
      //        _odata.reserve(_grid->oSites());
 | 
					      //        _odata.reserve(_grid->oSites());
 | 
				
			||||||
      //        _odata.resize(_grid->oSites());
 | 
					      //        _odata.resize(_grid->oSites());
 | 
				
			||||||
 | 
					    //      std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
 | 
				
			||||||
        assert((((uint64_t)&_odata[0])&0xF) ==0);
 | 
					        assert((((uint64_t)&_odata[0])&0xF) ==0);
 | 
				
			||||||
        checkerboard=0;
 | 
					        checkerboard=0;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,11 +1,10 @@
 | 
				
			|||||||
#ifndef GRID_BINARY_IO_H
 | 
					#ifndef GRID_BINARY_IO_H
 | 
				
			||||||
#define GRID_BINARY_IO_H
 | 
					#define GRID_BINARY_IO_H
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#ifdef HAVE_ENDIAN_H
 | 
					#ifdef HAVE_ENDIAN_H
 | 
				
			||||||
#include <endian.h>
 | 
					#include <endian.h>
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
#include <arpa/inet.h>
 | 
					#include <arpa/inet.h>
 | 
				
			||||||
#include <algorithm>
 | 
					#include <algorithm>
 | 
				
			||||||
// 64bit endian swap is a portability pain
 | 
					// 64bit endian swap is a portability pain
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -26,7 +26,7 @@ namespace Grid {
 | 
				
			|||||||
    // and Methods:
 | 
					    // and Methods:
 | 
				
			||||||
    //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
					    //    void ImportGauge(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
				
			||||||
    //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
					    //    void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 | 
				
			||||||
    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St)
 | 
					    //    void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
 | 
				
			||||||
    //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
 | 
					    //    void InsertForce4D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
 | 
				
			||||||
    //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
 | 
					    //    void InsertForce5D(GaugeField &mat,const FermionField &Btilde,const FermionField &A,int mu)
 | 
				
			||||||
    //
 | 
					    //
 | 
				
			||||||
@@ -101,6 +101,7 @@ namespace Grid {
 | 
				
			|||||||
    typedef typename Impl::SiteSpinor               SiteSpinor;		\
 | 
					    typedef typename Impl::SiteSpinor               SiteSpinor;		\
 | 
				
			||||||
    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
 | 
					    typedef typename Impl::SiteHalfSpinor       SiteHalfSpinor;		\
 | 
				
			||||||
    typedef typename Impl::Compressor               Compressor;		\
 | 
					    typedef typename Impl::Compressor               Compressor;		\
 | 
				
			||||||
 | 
					    typedef typename Impl::StencilImpl              StencilImpl;	\
 | 
				
			||||||
    typedef typename Impl::ImplParams ImplParams;
 | 
					    typedef typename Impl::ImplParams ImplParams;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    ///////
 | 
					    ///////
 | 
				
			||||||
@@ -112,7 +113,6 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      typedef ImplGauge<S,Nrepresentation> Gimpl;
 | 
					      typedef ImplGauge<S,Nrepresentation> Gimpl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
      INHERIT_GIMPL_TYPES(Gimpl);
 | 
					      INHERIT_GIMPL_TYPES(Gimpl);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
 | 
					      template<typename vtype> using iImplSpinor             = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
 | 
				
			||||||
@@ -128,10 +128,11 @@ namespace Grid {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
					      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
				
			||||||
      typedef WilsonImplParams ImplParams;
 | 
					      typedef WilsonImplParams ImplParams;
 | 
				
			||||||
 | 
					      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
 | 
				
			||||||
      ImplParams Params;
 | 
					      ImplParams Params;
 | 
				
			||||||
      WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
 | 
					      WilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
 | 
					      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
 | 
				
			||||||
        mult(&phi(),&U(mu),&chi());
 | 
					        mult(&phi(),&U(mu),&chi());
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -198,13 +199,15 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 | 
					      typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
					      typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 | 
				
			||||||
 | 
					      typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      typedef GparityWilsonImplParams ImplParams;
 | 
					      typedef GparityWilsonImplParams ImplParams;
 | 
				
			||||||
      ImplParams Params;
 | 
					      ImplParams Params;
 | 
				
			||||||
      GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
 | 
					      GparityWilsonImpl(const ImplParams &p= ImplParams()) : Params(p) {}; 
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
 | 
					      // provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity
 | 
				
			||||||
      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,CartesianStencil &St){
 | 
					      inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	typedef SiteHalfSpinor vobj;
 | 
						typedef SiteHalfSpinor vobj;
 | 
				
			||||||
	typedef typename SiteHalfSpinor::scalar_object sobj;
 | 
						typedef typename SiteHalfSpinor::scalar_object sobj;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -109,7 +109,7 @@ namespace QCD {
 | 
				
			|||||||
  ///////////////////////////////////
 | 
					  ///////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template<class Impl>
 | 
					  template<class Impl>
 | 
				
			||||||
  void WilsonFermion<Impl>::DerivInternal(CartesianStencil & st,
 | 
					  void WilsonFermion<Impl>::DerivInternal(StencilImpl & st,
 | 
				
			||||||
					  DoubledGaugeField & U,
 | 
										  DoubledGaugeField & U,
 | 
				
			||||||
					  GaugeField &mat,
 | 
										  GaugeField &mat,
 | 
				
			||||||
					  const FermionField &A,
 | 
										  const FermionField &A,
 | 
				
			||||||
@@ -123,7 +123,7 @@ namespace QCD {
 | 
				
			|||||||
    FermionField Atilde(B._grid);
 | 
					    FermionField Atilde(B._grid);
 | 
				
			||||||
    Atilde = A;
 | 
					    Atilde = A;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
 | 
					    st.HaloExchange(B,comm_buf,compressor);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    for(int mu=0;mu<Nd;mu++){
 | 
					    for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
@@ -242,7 +242,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
    Compressor compressor(dag);
 | 
					    Compressor compressor(dag);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
 | 
					    Stencil.HaloExchange(in,comm_buf,compressor);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
					      for(int sss=0;sss<in._grid->oSites();sss++){
 | 
				
			||||||
@@ -253,13 +253,13 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  template<class Impl>
 | 
					  template<class Impl>
 | 
				
			||||||
  void WilsonFermion<Impl>::DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
 | 
					  void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
				
			||||||
					 const FermionField &in, FermionField &out,int dag) {
 | 
										 const FermionField &in, FermionField &out,int dag) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
					    assert((dag==DaggerNo) ||(dag==DaggerYes));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Compressor compressor(dag);
 | 
					    Compressor compressor(dag);
 | 
				
			||||||
    st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
 | 
					    st.HaloExchange(in,comm_buf,compressor);
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    if ( dag == DaggerYes ) {
 | 
					    if ( dag == DaggerYes ) {
 | 
				
			||||||
      if( HandOptDslash ) {
 | 
					      if( HandOptDslash ) {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -73,14 +73,14 @@ namespace Grid {
 | 
				
			|||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      // Extra methods added by derived
 | 
					      // Extra methods added by derived
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      void DerivInternal(CartesianStencil & st,
 | 
					      void DerivInternal(StencilImpl & st,
 | 
				
			||||||
			 DoubledGaugeField & U,
 | 
								 DoubledGaugeField & U,
 | 
				
			||||||
			 GaugeField &mat,
 | 
								 GaugeField &mat,
 | 
				
			||||||
			 const FermionField &A,
 | 
								 const FermionField &A,
 | 
				
			||||||
			 const FermionField &B,
 | 
								 const FermionField &B,
 | 
				
			||||||
			 int dag);
 | 
								 int dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void DhopInternal(CartesianStencil & st,DoubledGaugeField & U,
 | 
					      void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
 | 
				
			||||||
			const FermionField &in, FermionField &out,int dag) ;
 | 
								const FermionField &in, FermionField &out,int dag) ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -108,9 +108,9 @@ namespace Grid {
 | 
				
			|||||||
      GridBase                     *  _cbgrid;
 | 
					      GridBase                     *  _cbgrid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      //Defines the stencils for even and odd
 | 
					      //Defines the stencils for even and odd
 | 
				
			||||||
      CartesianStencil Stencil; 
 | 
					      StencilImpl Stencil; 
 | 
				
			||||||
      CartesianStencil StencilEven; 
 | 
					      StencilImpl StencilEven; 
 | 
				
			||||||
      CartesianStencil StencilOdd; 
 | 
					      StencilImpl StencilOdd; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // Copy of the gauge field , with even and odd subsets
 | 
					      // Copy of the gauge field , with even and odd subsets
 | 
				
			||||||
      DoubledGaugeField Umu;
 | 
					      DoubledGaugeField Umu;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,4 +1,5 @@
 | 
				
			|||||||
#include <Grid.h>
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#include <PerfCount.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
namespace QCD {
 | 
					namespace QCD {
 | 
				
			||||||
@@ -7,6 +8,7 @@ namespace QCD {
 | 
				
			|||||||
const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
 | 
					const std::vector<int> WilsonFermion5DStatic::directions   ({1,2,3,4, 1, 2, 3, 4});
 | 
				
			||||||
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
 | 
					const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
 | 
				
			||||||
int WilsonFermion5DStatic::HandOptDslash;
 | 
					int WilsonFermion5DStatic::HandOptDslash;
 | 
				
			||||||
 | 
					int WilsonFermion5DStatic::AsmOptDslash;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // 5d lattice for DWF.
 | 
					  // 5d lattice for DWF.
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
@@ -68,6 +70,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
 | 
				
			|||||||
  comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
 | 
					  comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ImportGauge(_Umu);
 | 
					  ImportGauge(_Umu);
 | 
				
			||||||
 | 
					  commtime=0;
 | 
				
			||||||
 | 
					  dslashtime=0;
 | 
				
			||||||
}  
 | 
					}  
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
					void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 | 
				
			||||||
@@ -85,7 +89,7 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
 | 
				
			|||||||
  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
 | 
					  //  assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Compressor compressor(DaggerNo);
 | 
					  Compressor compressor(DaggerNo);
 | 
				
			||||||
  Stencil.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
 | 
					  Stencil.HaloExchange(in,comm_buf,compressor);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  int skip = (disp==1) ? 0 : 1;
 | 
					  int skip = (disp==1) ? 0 : 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -105,7 +109,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
 | 
					void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
 | 
				
			||||||
					  DoubledGaugeField & U,
 | 
										  DoubledGaugeField & U,
 | 
				
			||||||
					  GaugeField &mat,
 | 
										  GaugeField &mat,
 | 
				
			||||||
					  const FermionField &A,
 | 
										  const FermionField &A,
 | 
				
			||||||
@@ -122,7 +126,7 @@ void WilsonFermion5D<Impl>::DerivInternal(CartesianStencil & st,
 | 
				
			|||||||
  FermionField Btilde(B._grid);
 | 
					  FermionField Btilde(B._grid);
 | 
				
			||||||
  FermionField Atilde(B._grid);
 | 
					  FermionField Atilde(B._grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(B,comm_buf,compressor);
 | 
					  st.HaloExchange(B,comm_buf,compressor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  Atilde=A;
 | 
					  Atilde=A;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -194,6 +198,27 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 | 
				
			|||||||
  DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
 | 
					  DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class Impl>
 | 
				
			||||||
 | 
					void WilsonFermion5D<Impl>::Report(void)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Halo   time "<<commtime <<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil All    time "<<Stencil.halotime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil nosplice time "<<Stencil.nosplicetime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil gather time "<<Stencil.gathertime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil comm   time "<<Stencil.commtime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil scattertime "<<Stencil.scattertime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil splice time "<<Stencil.splicetime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil comm   time "<<Stencil.commstime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil gathremtime "<<Stencil.gathermtime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil merge  time "<<Stencil.mergetime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Stencil buf    time "<<Stencil.buftime<<" us"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "********************"<<std::endl;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 | 
					void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 | 
				
			||||||
				  const FermionField &A,
 | 
									  const FermionField &A,
 | 
				
			||||||
@@ -212,7 +237,7 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &lo,
 | 
					void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 | 
				
			||||||
					 DoubledGaugeField & U,
 | 
										 DoubledGaugeField & U,
 | 
				
			||||||
					 const FermionField &in, FermionField &out,int dag)
 | 
										 const FermionField &in, FermionField &out,int dag)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -220,19 +245,32 @@ void WilsonFermion5D<Impl>::DhopInternal(CartesianStencil & st, LebesgueOrder &l
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  Compressor compressor(dag);
 | 
					  Compressor compressor(dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  st.HaloExchange<SiteSpinor,SiteHalfSpinor,Compressor>(in,comm_buf,compressor);
 | 
					  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int threads = GridThread::GetThreads();
 | 
				
			||||||
 | 
					  int HT      = GridThread::GetHyperThreads();
 | 
				
			||||||
 | 
					  int cores   = GridThread::GetCores();
 | 
				
			||||||
 | 
					  int nwork = U._grid->oSites();
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  commtime -=usecond();
 | 
				
			||||||
 | 
					  st.HaloExchange(in,comm_buf,compressor);
 | 
				
			||||||
 | 
					  commtime +=usecond();
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
					  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
 | 
				
			||||||
  // Not loop ordering and data layout.
 | 
					  // Not loop ordering and data layout.
 | 
				
			||||||
  // Designed to create 
 | 
					  // Designed to create 
 | 
				
			||||||
  // - per thread reuse in L1 cache for U
 | 
					  // - per thread reuse in L1 cache for U
 | 
				
			||||||
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
					  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
 | 
				
			||||||
 | 
					  dslashtime -=usecond();
 | 
				
			||||||
  if ( dag == DaggerYes ) {
 | 
					  if ( dag == DaggerYes ) {
 | 
				
			||||||
    if( this->HandOptDslash ) {
 | 
					    if( this->HandOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					#pragma omp parallel for schedule(static)
 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  int sU=ss;
 | 
						  int sU=ss;
 | 
				
			||||||
 | 
						  if (    LebesgueOrder::UseLebesgueOrder ) {
 | 
				
			||||||
 | 
						    sU=lo.Reorder(ss);
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
						  int sF = s+Ls*sU;
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 | 
						  Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sF,sU,in,out);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
@@ -251,16 +289,79 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  } else {
 | 
					  } else {
 | 
				
			||||||
    if( this->HandOptDslash ) {
 | 
					    if( this->AsmOptDslash ) {
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					      //      for(int i=0;i<1;i++){
 | 
				
			||||||
 | 
					      //      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
				
			||||||
 | 
					      //	PerformanceCounter Counter(i);
 | 
				
			||||||
 | 
					      //	Counter.Start();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#pragma omp parallel for 
 | 
				
			||||||
 | 
					      for(int t=0;t<threads;t++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int hyperthread = t%HT;
 | 
				
			||||||
 | 
						int core        = t/HT;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        int sswork, swork,soff,ssoff,  sU,sF;
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						GridThread::GetWork(nwork,core,sswork,ssoff,cores);
 | 
				
			||||||
 | 
						GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for(int ss=0;ss<sswork;ss++){
 | 
				
			||||||
 | 
						  for(int s=soff;s<soff+swork;s++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						    sU=ss+ ssoff;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						    if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
				
			||||||
 | 
						      sU = lo.Reorder(sU);
 | 
				
			||||||
 | 
						    }
 | 
				
			||||||
 | 
						    sF = s+Ls*sU;
 | 
				
			||||||
 | 
						    Kernels::DiracOptAsmDhopSite(st,U,comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      //      Counter.Stop();
 | 
				
			||||||
 | 
					      //      Counter.Report();
 | 
				
			||||||
 | 
					      //      }
 | 
				
			||||||
 | 
					    } else if( this->HandOptDslash ) {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#pragma omp parallel for 
 | 
				
			||||||
 | 
					      for(int t=0;t<threads;t++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int hyperthread = t%HT;
 | 
				
			||||||
 | 
						int core        = t/HT;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        int sswork, swork,soff, sU,sF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						sswork = (nwork + cores-1)/cores;
 | 
				
			||||||
 | 
						GridThread::GetWork(Ls   , hyperthread, swork, soff,HT);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for(int ss=0;ss<sswork;ss++){
 | 
				
			||||||
 | 
						  sU=ss+core*sswork; // max locality within an L2 slice
 | 
				
			||||||
 | 
						  if ( LebesgueOrder::UseLebesgueOrder ) {
 | 
				
			||||||
 | 
						    sU = lo.Reorder(sU);
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						  if ( sU < nwork ) {
 | 
				
			||||||
 | 
						    for(int s=soff;s<soff+swork;s++){
 | 
				
			||||||
 | 
						      sF = s+Ls*sU;
 | 
				
			||||||
 | 
						      Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 | 
				
			||||||
 | 
						    }
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      /*
 | 
				
			||||||
 | 
					#pragma omp parallel for schedule(static)
 | 
				
			||||||
      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
					      for(int ss=0;ss<U._grid->oSites();ss++){
 | 
				
			||||||
	for(int s=0;s<Ls;s++){
 | 
						for(int s=0;s<Ls;s++){
 | 
				
			||||||
	  //	  int sU=lo.Reorder(ss);
 | 
					 | 
				
			||||||
	  int sU=ss;
 | 
						  int sU=ss;
 | 
				
			||||||
 | 
						  if (    LebesgueOrder::UseLebesgueOrder ) {
 | 
				
			||||||
 | 
						    sU=lo.Reorder(ss);
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
	  int sF = s+Ls*sU;
 | 
						  int sF = s+Ls*sU;
 | 
				
			||||||
	  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 | 
						  Kernels::DiracOptHandDhopSite(st,U,comm_buf,sF,sU,in,out);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
 | 
					      */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    } else { 
 | 
					    } else { 
 | 
				
			||||||
PARALLEL_FOR_LOOP
 | 
					PARALLEL_FOR_LOOP
 | 
				
			||||||
@@ -274,6 +375,7 @@ PARALLEL_FOR_LOOP
 | 
				
			|||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 | 
					  dslashtime +=usecond();
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
					void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -19,6 +19,7 @@ namespace Grid {
 | 
				
			|||||||
    class WilsonFermion5DStatic { 
 | 
					    class WilsonFermion5DStatic { 
 | 
				
			||||||
    public:
 | 
					    public:
 | 
				
			||||||
      // S-direction is INNERMOST and takes no part in the parity.
 | 
					      // S-direction is INNERMOST and takes no part in the parity.
 | 
				
			||||||
 | 
					      static int AsmOptDslash; // these are a temporary hack
 | 
				
			||||||
      static int HandOptDslash; // these are a temporary hack
 | 
					      static int HandOptDslash; // these are a temporary hack
 | 
				
			||||||
      static const std::vector<int> directions;
 | 
					      static const std::vector<int> directions;
 | 
				
			||||||
      static const std::vector<int> displacements;
 | 
					      static const std::vector<int> displacements;
 | 
				
			||||||
@@ -31,7 +32,8 @@ namespace Grid {
 | 
				
			|||||||
    public:
 | 
					    public:
 | 
				
			||||||
     INHERIT_IMPL_TYPES(Impl);
 | 
					     INHERIT_IMPL_TYPES(Impl);
 | 
				
			||||||
     typedef WilsonKernels<Impl> Kernels;
 | 
					     typedef WilsonKernels<Impl> Kernels;
 | 
				
			||||||
 | 
					     double commtime;
 | 
				
			||||||
 | 
					     double dslashtime;
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      // Implement the abstract base
 | 
					      // Implement the abstract base
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -72,14 +74,14 @@ namespace Grid {
 | 
				
			|||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      // New methods added 
 | 
					      // New methods added 
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      void DerivInternal(CartesianStencil & st,
 | 
					      void DerivInternal(StencilImpl & st,
 | 
				
			||||||
			 DoubledGaugeField & U,
 | 
								 DoubledGaugeField & U,
 | 
				
			||||||
			 GaugeField &mat,
 | 
								 GaugeField &mat,
 | 
				
			||||||
			 const FermionField &A,
 | 
								 const FermionField &A,
 | 
				
			||||||
			 const FermionField &B,
 | 
								 const FermionField &B,
 | 
				
			||||||
			 int dag);
 | 
								 int dag);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      void DhopInternal(CartesianStencil & st,
 | 
					      void DhopInternal(StencilImpl & st,
 | 
				
			||||||
			LebesgueOrder &lo,
 | 
								LebesgueOrder &lo,
 | 
				
			||||||
			DoubledGaugeField &U,
 | 
								DoubledGaugeField &U,
 | 
				
			||||||
			const FermionField &in, 
 | 
								const FermionField &in, 
 | 
				
			||||||
@@ -97,6 +99,7 @@ namespace Grid {
 | 
				
			|||||||
      // DoubleStore
 | 
					      // DoubleStore
 | 
				
			||||||
      void ImportGauge(const GaugeField &_Umu);
 | 
					      void ImportGauge(const GaugeField &_Umu);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      void Report(void);
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
      // Data members require to support the functionality
 | 
					      // Data members require to support the functionality
 | 
				
			||||||
      ///////////////////////////////////////////////////////////////
 | 
					      ///////////////////////////////////////////////////////////////
 | 
				
			||||||
@@ -112,9 +115,9 @@ namespace Grid {
 | 
				
			|||||||
      int Ls;
 | 
					      int Ls;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      //Defines the stencils for even and odd
 | 
					      //Defines the stencils for even and odd
 | 
				
			||||||
      CartesianStencil Stencil; 
 | 
					      StencilImpl Stencil; 
 | 
				
			||||||
      CartesianStencil StencilEven; 
 | 
					      StencilImpl StencilEven; 
 | 
				
			||||||
      CartesianStencil StencilOdd; 
 | 
					      StencilImpl StencilOdd; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      // Copy of the gauge field , with even and odd subsets
 | 
					      // Copy of the gauge field , with even and odd subsets
 | 
				
			||||||
      DoubledGaugeField Umu;
 | 
					      DoubledGaugeField Umu;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -3,7 +3,7 @@ namespace Grid {
 | 
				
			|||||||
namespace QCD {
 | 
					namespace QCD {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
						  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						  int sF,int sU,const FermionField &in, FermionField &out)
 | 
											  int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -78,7 +78,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
 | 
				
			|||||||
  }
 | 
					  }
 | 
				
			||||||
  Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
 | 
					  Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
 | 
				
			||||||
  accumReconXm(result,Uchi);
 | 
					  accumReconXm(result,Uchi);
 | 
				
			||||||
  
 | 
					
 | 
				
			||||||
  // Ym
 | 
					  // Ym
 | 
				
			||||||
  SE=st.GetEntry(ptype,Ym,sF);
 | 
					  SE=st.GetEntry(ptype,Ym,sF);
 | 
				
			||||||
  if (  SE->_is_local && SE->_permute ) {
 | 
					  if (  SE->_is_local && SE->_permute ) {
 | 
				
			||||||
@@ -122,7 +122,7 @@ void WilsonKernels<Impl>::DiracOptDhopSite(CartesianStencil &st,DoubledGaugeFiel
 | 
				
			|||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
					      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					      int sF,int sU,const FermionField &in, FermionField &out)
 | 
										      int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -241,7 +241,7 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeF
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl> 
 | 
					template<class Impl> 
 | 
				
			||||||
void WilsonKernels<Impl>::DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
					  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					  int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
 | 
										  int sF,int sU,const FermionField &in, FermionField &out,int dir,int gamma)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -17,36 +17,47 @@ namespace Grid {
 | 
				
			|||||||
     typedef FermionOperator<Impl> Base;
 | 
					     typedef FermionOperator<Impl> Base;
 | 
				
			||||||
     
 | 
					     
 | 
				
			||||||
    public:
 | 
					    public:
 | 
				
			||||||
     void DiracOptDhopSite(CartesianStencil &st,DoubledGaugeField &U,
 | 
					     void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			   int sF,int sU,const FermionField &in, FermionField &out);
 | 
								   int sF,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
      
 | 
					      
 | 
				
			||||||
     void DiracOptDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
 | 
					     void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			      int sF,int sU,const FermionField &in,FermionField &out);
 | 
								      int sF,int sU,const FermionField &in,FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptDhopDir(CartesianStencil &st,DoubledGaugeField &U,
 | 
					     void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 | 
								  int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
 | 
				
			||||||
 | 
					#if defined(AVX512) || defined(IMCI)
 | 
				
			||||||
 | 
					     void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
								      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *);
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					     void DiracOptAsmDhopSite(CartesianStencil &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
								      std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
								      int sF,int sU,const FermionField &in, FermionField &out,uint64_t *p){
 | 
				
			||||||
 | 
					       DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
				
			||||||
 | 
					     }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
#define HANDOPT
 | 
					#define HANDOPT
 | 
				
			||||||
#ifdef HANDOPT
 | 
					#ifdef HANDOPT
 | 
				
			||||||
     void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
 | 
					     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			       int sF,int sU,const FermionField &in, FermionField &out);
 | 
								       int sF,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
 | 
					     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
									  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
				  int sF,int sU,const FermionField &in, FermionField &out);
 | 
									  int sF,int sU,const FermionField &in, FermionField &out);
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
 | 
					     void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
			       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
								       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
			       int sF,int sU,const FermionField &in, FermionField &out)
 | 
								       int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
     {
 | 
					     {
 | 
				
			||||||
       DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
					       DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
 | 
				
			||||||
     }
 | 
					     }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
     void DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
 | 
					     void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
				  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
									  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
				  int sF,int sU,const FermionField &in, FermionField &out)
 | 
									  int sF,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
     {
 | 
					     {
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										317
									
								
								lib/qcd/action/fermion/WilsonKernelsAsm.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										317
									
								
								lib/qcd/action/fermion/WilsonKernelsAsm.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,317 @@
 | 
				
			|||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#if defined(AVX512) || defined (IMCI)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#include <simd/Avx512Asm.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VLOAD
 | 
				
			||||||
 | 
					#undef VSTORE
 | 
				
			||||||
 | 
					#undef VMUL
 | 
				
			||||||
 | 
					#undef VMADD
 | 
				
			||||||
 | 
					#undef ZEND
 | 
				
			||||||
 | 
					#undef ZLOAD
 | 
				
			||||||
 | 
					#undef ZMUL
 | 
				
			||||||
 | 
					#undef ZMADD
 | 
				
			||||||
 | 
					#undef VZERO
 | 
				
			||||||
 | 
					#undef VTIMESI
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VZERO(A)                  VZEROf(A)
 | 
				
			||||||
 | 
					#define VMOV(A,B)                 VMOVf(A,B)
 | 
				
			||||||
 | 
					#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
				
			||||||
 | 
					#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VADD(A,B,C)               VADDf(A,B,C)
 | 
				
			||||||
 | 
					#define VSUB(A,B,C)               VSUBf(A,B,C)
 | 
				
			||||||
 | 
					#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULf(Uri,Uir,Chi,UChi,Z)
 | 
				
			||||||
 | 
					#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESI(A,B,C)            VTIMESIf(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI(A,B,C)       VTIMESMINUSIf(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI(A,B,C)         VACCTIMESIf(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI(A,B,C)    VACCTIMESMINUSIf(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESI0(A,B,C)            VTIMESI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI0(A,B,C)       VTIMESMINUSI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI0(A,B,C)         VACCTIMESI0f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI0(A,B,C)    VACCTIMESMINUSI0f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESI1(A,B,C)            VTIMESI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI1(A,B,C)       VTIMESMINUSI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI1(A,B,C)         VACCTIMESI1f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1(A,B,C)    VACCTIMESMINUSI1f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VTIMESI2(A,B,C)            VTIMESI2f(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI2(A,B,C)       VTIMESMINUSI2f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESI2(A,B,C)         VACCTIMESI2f(A,B,C)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2(A,B,C)    VACCTIMESMINUSI2f(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VPERM0(A,B)               VPERM0f(A,B)
 | 
				
			||||||
 | 
					#define VPERM1(A,B)               VPERM1f(A,B)
 | 
				
			||||||
 | 
					#define VPERM2(A,B)               VPERM2f(A,B)
 | 
				
			||||||
 | 
					#define VPERM3(A,B)               VPERM3f(A,B)
 | 
				
			||||||
 | 
					#define VSHUFMEM(OFF,A,DEST)      VSHUFMEMf(OFF,A,DEST)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZEND1(A,B,C)               ZEND1f(A,B,C)
 | 
				
			||||||
 | 
					#define ZEND2(A,B,C)               ZEND2f(A,B,C)
 | 
				
			||||||
 | 
					#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
 | 
				
			||||||
 | 
					#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VADDMEM(O,A,B,C)            VADDMEMf(O,A,B,C)
 | 
				
			||||||
 | 
					#define VSUBMEM(O,A,B,C)            VSUBMEMf(O,A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					namespace QCD {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class Impl>
 | 
				
			||||||
 | 
					void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
 | 
											   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
 | 
										       int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  uint64_t  now;
 | 
				
			||||||
 | 
					  uint64_t first ;
 | 
				
			||||||
 | 
					  int offset,local,perm, ptype;
 | 
				
			||||||
 | 
					  const SiteHalfSpinor *pbuf = & buf[0];
 | 
				
			||||||
 | 
					  const SiteSpinor   *plocal = & in._odata[0];
 | 
				
			||||||
 | 
					  void *pf;
 | 
				
			||||||
 | 
					  int osites = in._grid->oSites();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  StencilEntry *SE;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //#define STAMP(i) timers[i] = __rdtsc() ; 
 | 
				
			||||||
 | 
					#define STAMP(i) //timers[i] = __rdtsc() ; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  first = __rdtsc();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Xm,ss);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 0
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LOAD64(%r9,pf);
 | 
				
			||||||
 | 
					  __asm__( 
 | 
				
			||||||
 | 
						  VPREFETCH(0,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(1,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(2,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(3,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(4,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(5,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(6,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(7,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(8,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(9,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(10,%r9)
 | 
				
			||||||
 | 
						  VPREFETCH(11,%r9) );
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Xm
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Prefetch
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Ym,ss);
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    XM_PROJMEM(&plocal[offset]);
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(&pbuf[offset]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFXM(Xm,pf);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  XM_RECON;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Ym
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Prefetch
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Zm,ss);
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    YM_PROJMEM(&plocal[offset]);
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(&pbuf[offset]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFYM(Ym,pf);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  YM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Zm
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Prefetch
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Tm,ss);
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    ZM_PROJMEM(&plocal[offset]);
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(&pbuf[offset]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFZM(Zm,pf);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  ZM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Tm
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Tp,ss);
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    TM_PROJMEM(&plocal[offset]);
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(&pbuf[offset]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFTM(Tm,pf);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  TM_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Tp
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Prefetch
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Zp,ss);
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    TP_PROJMEM(&plocal[offset]);
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(&pbuf[offset]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFTP(Tp,pf);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  TP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Zp
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Prefetch
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Yp,ss);
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    ZP_PROJMEM(&plocal[offset]);
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(&pbuf[offset]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFZP(Zp,pf);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  ZP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Prefetch
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Xp,ss);
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    YP_PROJMEM(&plocal[offset]);
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(&pbuf[offset]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFYP(Yp,pf);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  YP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Xp
 | 
				
			||||||
 | 
					  perm   = SE->_permute;
 | 
				
			||||||
 | 
					  offset = SE->_offset;
 | 
				
			||||||
 | 
					  local  = SE->_is_local;
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					  //  PREFETCH_R(A);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Prefetch
 | 
				
			||||||
 | 
					  SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
 | 
				
			||||||
 | 
					  if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
 | 
				
			||||||
 | 
					  else               pf=(void *)&pbuf[SE->_offset];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( local ) {
 | 
				
			||||||
 | 
					    XP_PROJMEM(&plocal[offset]);
 | 
				
			||||||
 | 
					    if ( perm) {
 | 
				
			||||||
 | 
					      PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  } else { 
 | 
				
			||||||
 | 
					    LOAD_CHI(&pbuf[offset]);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    MULT_2SPIN_DIR_PFXP(Xp,pf);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  XP_RECON_ACCUM;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 debug:
 | 
				
			||||||
 | 
					  SAVE_RESULT(&out._odata[ss]);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  template class WilsonKernels<WilsonImplF>;		
 | 
				
			||||||
 | 
					  template class WilsonKernels<WilsonImplD>; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}}
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
@@ -282,7 +282,7 @@ namespace QCD {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#ifdef HANDOPT
 | 
					#ifdef HANDOPT
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
										       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
					       int ss,int sU,const FermionField &in, FermionField &out)
 | 
										       int ss,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
@@ -526,7 +526,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(CartesianStencil &st,DoubledGaug
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
template<class Impl>
 | 
					template<class Impl>
 | 
				
			||||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(CartesianStencil &st,DoubledGaugeField &U,
 | 
					void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
 | 
				
			||||||
						   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
											   std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,
 | 
				
			||||||
						   int ss,int sU,const FermionField &in, FermionField &out)
 | 
											   int ss,int sU,const FermionField &in, FermionField &out)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										1112
									
								
								lib/simd/Avx512Asm.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1112
									
								
								lib/simd/Avx512Asm.h
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -8,6 +8,9 @@
 | 
				
			|||||||
//----------------------------------------------------------------------
 | 
					//----------------------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <immintrin.h>
 | 
					#include <immintrin.h>
 | 
				
			||||||
 | 
					#ifdef AVXFMA4
 | 
				
			||||||
 | 
					#include <x86intrin.h>
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
 | 
					// _mm256_set_m128i(hi,lo); // not defined in all versions of immintrin.h
 | 
				
			||||||
#ifndef _mm256_set_m128i
 | 
					#ifndef _mm256_set_m128i
 | 
				
			||||||
#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
 | 
					#define _mm256_set_m128i(hi,lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo),(hi),1)
 | 
				
			||||||
@@ -132,7 +135,7 @@ namespace Optimization {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    //Integer
 | 
					    //Integer
 | 
				
			||||||
    inline __m256i operator()(__m256i a, __m256i b){
 | 
					    inline __m256i operator()(__m256i a, __m256i b){
 | 
				
			||||||
#if defined (AVX1) 
 | 
					#if defined (AVX1) || defined (AVXFMA4)
 | 
				
			||||||
          __m128i a0,a1;
 | 
					          __m128i a0,a1;
 | 
				
			||||||
          __m128i b0,b1;
 | 
					          __m128i b0,b1;
 | 
				
			||||||
          a0 = _mm256_extractf128_si256(a,0);
 | 
					          a0 = _mm256_extractf128_si256(a,0);
 | 
				
			||||||
@@ -146,7 +149,6 @@ namespace Optimization {
 | 
				
			|||||||
#if defined (AVX2)
 | 
					#if defined (AVX2)
 | 
				
			||||||
            return _mm256_add_epi32(a,b);
 | 
					            return _mm256_add_epi32(a,b);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -161,7 +163,7 @@ namespace Optimization {
 | 
				
			|||||||
    }
 | 
					    }
 | 
				
			||||||
    //Integer
 | 
					    //Integer
 | 
				
			||||||
    inline __m256i operator()(__m256i a, __m256i b){
 | 
					    inline __m256i operator()(__m256i a, __m256i b){
 | 
				
			||||||
#if defined (AVX1) 
 | 
					#if defined (AVX1) || defined (AVXFMA4)
 | 
				
			||||||
          __m128i a0,a1;
 | 
					          __m128i a0,a1;
 | 
				
			||||||
          __m128i b0,b1;
 | 
					          __m128i b0,b1;
 | 
				
			||||||
          a0 = _mm256_extractf128_si256(a,0);
 | 
					          a0 = _mm256_extractf128_si256(a,0);
 | 
				
			||||||
@@ -182,6 +184,7 @@ namespace Optimization {
 | 
				
			|||||||
  struct MultComplex{
 | 
					  struct MultComplex{
 | 
				
			||||||
    // Complex float
 | 
					    // Complex float
 | 
				
			||||||
    inline __m256 operator()(__m256 a, __m256 b){
 | 
					    inline __m256 operator()(__m256 a, __m256 b){
 | 
				
			||||||
 | 
					#if defined (AVX1) 
 | 
				
			||||||
      __m256 ymm0,ymm1,ymm2;
 | 
					      __m256 ymm0,ymm1,ymm2;
 | 
				
			||||||
      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
 | 
					      ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
 | 
				
			||||||
      ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
 | 
					      ymm0 = _mm256_mul_ps(ymm0,b);                       // ymm0 <- ar bi, ar br
 | 
				
			||||||
@@ -190,6 +193,20 @@ namespace Optimization {
 | 
				
			|||||||
      ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
 | 
					      ymm2 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ymm2 <- ai,ai
 | 
				
			||||||
      ymm1 = _mm256_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
 | 
					      ymm1 = _mm256_mul_ps(ymm1,ymm2);                    // ymm1 <- br ai, ai bi
 | 
				
			||||||
      return _mm256_addsub_ps(ymm0,ymm1);  
 | 
					      return _mm256_addsub_ps(ymm0,ymm1);  
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined (AVXFMA4)
 | 
				
			||||||
 | 
					      __m256 a_real = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ar ar,
 | 
				
			||||||
 | 
					      __m256 a_imag = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(3,3,1,1)); // ai ai
 | 
				
			||||||
 | 
					      __m256 tmp = _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1));
 | 
				
			||||||
 | 
					      a_imag = _mm256_mul_ps( a_imag,tmp  );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
 | 
					      return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined (AVX2)
 | 
				
			||||||
 | 
					      __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
 | 
				
			||||||
 | 
					      __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
 | 
				
			||||||
 | 
					      a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) ));  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
 | 
					      return _mm256_fmaddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    // Complex double
 | 
					    // Complex double
 | 
				
			||||||
    inline __m256d operator()(__m256d a, __m256d b){
 | 
					    inline __m256d operator()(__m256d a, __m256d b){
 | 
				
			||||||
@@ -215,6 +232,7 @@ namespace Optimization {
 | 
				
			|||||||
	IF IMM0[3] = 0
 | 
						IF IMM0[3] = 0
 | 
				
			||||||
	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
 | 
						THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i   ; 0xC unchanged
 | 
				
			||||||
      */
 | 
					      */
 | 
				
			||||||
 | 
					#if defined (AVX1) 
 | 
				
			||||||
      __m256d ymm0,ymm1,ymm2;
 | 
					      __m256d ymm0,ymm1,ymm2;
 | 
				
			||||||
      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
 | 
					      ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
 | 
				
			||||||
      ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
 | 
					      ymm0 = _mm256_mul_pd(ymm0,b);      // ymm0 <- ar bi, ar br
 | 
				
			||||||
@@ -222,10 +240,71 @@ namespace Optimization {
 | 
				
			|||||||
      ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai  b'11,11
 | 
					      ymm2 = _mm256_shuffle_pd(a,a,0xF); // ymm2 <- ai,ai  b'11,11
 | 
				
			||||||
      ymm1 = _mm256_mul_pd(ymm1,ymm2);   // ymm1 <- br ai, ai bi
 | 
					      ymm1 = _mm256_mul_pd(ymm1,ymm2);   // ymm1 <- br ai, ai bi
 | 
				
			||||||
      return _mm256_addsub_pd(ymm0,ymm1);
 | 
					      return _mm256_addsub_pd(ymm0,ymm1);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined (AVXFMA4)
 | 
				
			||||||
 | 
					      __m256d a_real = _mm256_shuffle_pd(a,a,0x0);//arar
 | 
				
			||||||
 | 
					      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
 | 
				
			||||||
 | 
					      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
 | 
					      return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined (AVX2)
 | 
				
			||||||
 | 
					      __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
 | 
				
			||||||
 | 
					      __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
 | 
				
			||||||
 | 
					      a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) );  // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
 | 
				
			||||||
 | 
					      return _mm256_fmaddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi   +- Ai Bi             = ArBr-AiBi , ArBi+AiBr
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 0
 | 
				
			||||||
 | 
					  struct ComplexDot {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void Prep(__m256 ari,__m256 &air) {
 | 
				
			||||||
 | 
					      cdotRIperm(ari,air);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    inline void Mul(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
 | 
				
			||||||
 | 
					      riir=air*b;
 | 
				
			||||||
 | 
					      iirr=arr*b;
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    inline void Madd(__m256 ari,__m256 air,__m256 b,__m256 &riir,__m256 &iirr) {
 | 
				
			||||||
 | 
					      mac(riir,air,b);
 | 
				
			||||||
 | 
					      mac(iirr,ari,b);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    inline void End(__m256 ari,__m256 &air) {
 | 
				
			||||||
 | 
					      //      cdotRI
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Mult{
 | 
					  struct Mult{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void mac(__m256 &a, __m256 b, __m256 c){
 | 
				
			||||||
 | 
					#if defined (AVX1)
 | 
				
			||||||
 | 
					      a= _mm256_add_ps(_mm256_mul_ps(b,c),a);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined (AVXFMA4)
 | 
				
			||||||
 | 
					      a= _mm256_macc_ps(b,c,a);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined (AVX2)
 | 
				
			||||||
 | 
					      a= _mm256_fmadd_ps( b, c, a);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void mac(__m256d &a, __m256d b, __m256d c){
 | 
				
			||||||
 | 
					#if defined (AVX1)
 | 
				
			||||||
 | 
					      a= _mm256_add_pd(_mm256_mul_pd(b,c),a);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined (AVXFMA4)
 | 
				
			||||||
 | 
					      a= _mm256_macc_pd(b,c,a);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					#if defined (AVX2)
 | 
				
			||||||
 | 
					      a= _mm256_fmadd_pd( b, c, a);
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Real float
 | 
					    // Real float
 | 
				
			||||||
    inline __m256 operator()(__m256 a, __m256 b){
 | 
					    inline __m256 operator()(__m256 a, __m256 b){
 | 
				
			||||||
      return _mm256_mul_ps(a,b);
 | 
					      return _mm256_mul_ps(a,b);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -8,10 +8,7 @@
 | 
				
			|||||||
//----------------------------------------------------------------------
 | 
					//----------------------------------------------------------------------
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include <immintrin.h>
 | 
					#include <immintrin.h>
 | 
				
			||||||
#ifndef KNC_ONLY_STORES
 | 
					
 | 
				
			||||||
#define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
 | 
					 | 
				
			||||||
#define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
 | 
					 | 
				
			||||||
#endif
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Optimization {
 | 
					namespace Optimization {
 | 
				
			||||||
@@ -59,14 +56,15 @@ namespace Optimization {
 | 
				
			|||||||
  struct Vstream{
 | 
					  struct Vstream{
 | 
				
			||||||
    //Float
 | 
					    //Float
 | 
				
			||||||
    inline void operator()(float * a, __m512 b){
 | 
					    inline void operator()(float * a, __m512 b){
 | 
				
			||||||
      _mm512_storenrngo_ps(a,b);
 | 
					      //_mm512_stream_ps(a,b);
 | 
				
			||||||
 | 
					      _mm512_store_ps(a,b);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    //Double
 | 
					    //Double
 | 
				
			||||||
    inline void operator()(double * a, __m512d b){
 | 
					    inline void operator()(double * a, __m512d b){
 | 
				
			||||||
      _mm512_storenrngo_pd(a,b);
 | 
					      //_mm512_stream_pd(a,b);
 | 
				
			||||||
 | 
					      _mm512_store_pd(a,b);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -180,6 +178,15 @@ namespace Optimization {
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  struct Mult{
 | 
					  struct Mult{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void mac(__m512 &a, __m512 b, __m512 c){         
 | 
				
			||||||
 | 
					       a= _mm512_fmadd_ps( b, c, a);                         
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void mac(__m512d &a, __m512d b, __m512d c){
 | 
				
			||||||
 | 
					      a= _mm512_fmadd_pd( b, c, a);                   
 | 
				
			||||||
 | 
					    }                                             
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Real float
 | 
					    // Real float
 | 
				
			||||||
    inline __m512 operator()(__m512 a, __m512 b){
 | 
					    inline __m512 operator()(__m512 a, __m512 b){
 | 
				
			||||||
      return _mm512_mul_ps(a,b);
 | 
					      return _mm512_mul_ps(a,b);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -157,6 +157,12 @@ namespace Optimization {
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Mult{
 | 
					  struct Mult{
 | 
				
			||||||
 | 
					    inline float  mac(float a, float b,double c){
 | 
				
			||||||
 | 
					      return 0;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    inline double mac(double a, double b,double c){
 | 
				
			||||||
 | 
					      return 0;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
    // Real float
 | 
					    // Real float
 | 
				
			||||||
    inline float operator()(float a, float b){
 | 
					    inline float operator()(float a, float b){
 | 
				
			||||||
      return 0;
 | 
					      return 0;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -9,12 +9,6 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include <immintrin.h>
 | 
					#include <immintrin.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
//#ifndef KNC_ONLY_STORES
 | 
					 | 
				
			||||||
//#define  _mm512_storenrngo_ps _mm512_store_ps  // not present in AVX512
 | 
					 | 
				
			||||||
//#define  _mm512_storenrngo_pd _mm512_store_pd  // not present in AVX512
 | 
					 | 
				
			||||||
//#endif
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
namespace Optimization {
 | 
					namespace Optimization {
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  struct Vsplat{
 | 
					  struct Vsplat{
 | 
				
			||||||
@@ -197,6 +191,15 @@ namespace Optimization {
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  struct Mult{
 | 
					  struct Mult{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void mac(__m512 &a, __m512 b, __m512 c){         
 | 
				
			||||||
 | 
					       a= _mm512_fmadd_ps( b, c, a);                         
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void mac(__m512d &a, __m512d b, __m512d c){
 | 
				
			||||||
 | 
					      a= _mm512_fmadd_pd( b, c, a);                   
 | 
				
			||||||
 | 
					    }                                             
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Real float
 | 
					    // Real float
 | 
				
			||||||
    inline __m512 operator()(__m512 a, __m512 b){
 | 
					    inline __m512 operator()(__m512 a, __m512 b){
 | 
				
			||||||
      return _mm512_mul_ps(a,b);
 | 
					      return _mm512_mul_ps(a,b);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -171,6 +171,12 @@ namespace Optimization {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  struct Mult{
 | 
					  struct Mult{
 | 
				
			||||||
    // Real float
 | 
					    // Real float
 | 
				
			||||||
 | 
					    inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
 | 
				
			||||||
 | 
					      return vaddq_f32(vmulq_f32(b,c),a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
 | 
				
			||||||
 | 
					      return vaddq_f64(vmulq_f64(b,c),a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
					    inline float32x4_t operator()(float32x4_t a, float32x4_t b){
 | 
				
			||||||
      return vmulq_f32(a,b);
 | 
					      return vmulq_f32(a,b);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -171,6 +171,15 @@ namespace Optimization {
 | 
				
			|||||||
  };
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  struct Mult{
 | 
					  struct Mult{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void mac(__m128 &a, __m128 b, __m128 c){
 | 
				
			||||||
 | 
					      a= _mm128_add_ps(_mm128_mul_ps(b,c),a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    inline void mac(__m128d &a, __m128d b, __m128d c){
 | 
				
			||||||
 | 
					      a= _mm128_add_pd(_mm128_mul_pd(b,c),a);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    // Real float
 | 
					    // Real float
 | 
				
			||||||
    inline __m128 operator()(__m128 a, __m128 b){
 | 
					    inline __m128 operator()(__m128 a, __m128 b){
 | 
				
			||||||
      return _mm_mul_ps(a,b);
 | 
					      return _mm_mul_ps(a,b);
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -13,7 +13,7 @@
 | 
				
			|||||||
#ifdef SSE4
 | 
					#ifdef SSE4
 | 
				
			||||||
#include "Grid_sse4.h"
 | 
					#include "Grid_sse4.h"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined (AVX1)|| defined (AVX2)
 | 
					#if defined (AVX1)|| defined (AVX2) || defined (AVXFMA4)
 | 
				
			||||||
#include "Grid_avx.h"
 | 
					#include "Grid_avx.h"
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined AVX512
 | 
					#if defined AVX512
 | 
				
			||||||
@@ -133,7 +133,11 @@ namespace Grid {
 | 
				
			|||||||
    ///////////////////////////////////////////////
 | 
					    ///////////////////////////////////////////////
 | 
				
			||||||
    // mac, mult, sub, add, adj
 | 
					    // mac, mult, sub, add, adj
 | 
				
			||||||
    ///////////////////////////////////////////////
 | 
					    ///////////////////////////////////////////////
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    // FIXME -- alias this to an inline MAC struct.
 | 
				
			||||||
    friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
 | 
					    friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
 | 
					    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
 | 
				
			||||||
    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
 | 
					    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
 | 
				
			||||||
    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
 | 
					    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,8 +1,10 @@
 | 
				
			|||||||
#include <Grid.h>
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#include <algorithm>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int LebesgueOrder::UseLebesgueOrder;
 | 
					int LebesgueOrder::UseLebesgueOrder;
 | 
				
			||||||
 | 
					std::vector<int> LebesgueOrder::Block({2,2,2,2});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
 | 
					LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
 | 
				
			||||||
  n--;           // 1000 0011 --> 1000 0010
 | 
					  n--;           // 1000 0011 --> 1000 0010
 | 
				
			||||||
@@ -15,12 +17,79 @@ LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
 | 
				
			|||||||
  return n;
 | 
					  return n;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
LebesgueOrder::LebesgueOrder(GridBase *grid) 
 | 
					LebesgueOrder::LebesgueOrder(GridBase *_grid) 
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  grid = _grid;
 | 
				
			||||||
 | 
					  if ( Block[0]==0) ZGraph();
 | 
				
			||||||
 | 
					  else CartesianBlocking();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void LebesgueOrder::CartesianBlocking(void) 
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  _LebesgueReorder.resize(0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << " CartesianBlocking ";
 | 
				
			||||||
 | 
					  for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
 | 
				
			||||||
 | 
					  std::cout<<std::endl; 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  IndexInteger ND = grid->_ndimension;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(ND==4);
 | 
				
			||||||
 | 
					  assert(ND==Block.size());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<IndexInteger> dims(ND);
 | 
				
			||||||
 | 
					  std::vector<IndexInteger> xo(ND,0);
 | 
				
			||||||
 | 
					  std::vector<IndexInteger> xi(ND,0);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(IndexInteger mu=0;mu<ND;mu++){
 | 
				
			||||||
 | 
					    dims[mu] = grid->_rdimensions[mu];
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  IterateO(ND,ND-1,xo,xi,dims);
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void LebesgueOrder::IterateO(int ND,int dim,
 | 
				
			||||||
 | 
						      std::vector<IndexInteger> & xo,
 | 
				
			||||||
 | 
						      std::vector<IndexInteger> & xi,
 | 
				
			||||||
 | 
						      std::vector<IndexInteger> &dims)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  for(xo[dim]=0;xo[dim]<dims[dim];xo[dim]+=Block[dim]){
 | 
				
			||||||
 | 
					    if ( dim > 0 ) {
 | 
				
			||||||
 | 
					      IterateO(ND,dim-1,xo,xi,dims);
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					      IterateI(ND,ND-1,xo,xi,dims);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void LebesgueOrder::IterateI(int ND,
 | 
				
			||||||
 | 
						      int dim,
 | 
				
			||||||
 | 
						      std::vector<IndexInteger> & xo,
 | 
				
			||||||
 | 
						      std::vector<IndexInteger> & xi,
 | 
				
			||||||
 | 
						      std::vector<IndexInteger> &dims)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  std::vector<IndexInteger> x(ND);
 | 
				
			||||||
 | 
					  for(xi[dim]=0;xi[dim]<std::min(dims[dim]-xo[dim],Block[dim]);xi[dim]++){
 | 
				
			||||||
 | 
					    if ( dim > 0 ) {
 | 
				
			||||||
 | 
					      IterateI(ND,dim-1,xo,xi,dims);
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					      for(int d=0;d<ND;d++){
 | 
				
			||||||
 | 
						x[d]=xi[d]+xo[d];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					      IndexInteger index;
 | 
				
			||||||
 | 
					      grid->IndexFromCoor(x,index,grid->_rdimensions);
 | 
				
			||||||
 | 
					      _LebesgueReorder.push_back(index);
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void LebesgueOrder::ZGraph(void) 
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
  _LebesgueReorder.resize(0);
 | 
					  _LebesgueReorder.resize(0);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  // Align up dimensions to power of two.
 | 
					  // Align up dimensions to power of two.
 | 
				
			||||||
  const IndexInteger one=1;
 | 
					  const IndexInteger one=1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  IndexInteger ND = grid->_ndimension;
 | 
					  IndexInteger ND = grid->_ndimension;
 | 
				
			||||||
  std::vector<IndexInteger> dims(ND);
 | 
					  std::vector<IndexInteger> dims(ND);
 | 
				
			||||||
  std::vector<IndexInteger> adims(ND);
 | 
					  std::vector<IndexInteger> adims(ND);
 | 
				
			||||||
@@ -36,17 +105,7 @@ LebesgueOrder::LebesgueOrder(GridBase *grid)
 | 
				
			|||||||
  // i) avoids recursion 
 | 
					  // i) avoids recursion 
 | 
				
			||||||
  // ii) has loop lengths at most the width of a 32 bit word.
 | 
					  // ii) has loop lengths at most the width of a 32 bit word.
 | 
				
			||||||
  int sitebit=0;
 | 
					  int sitebit=0;
 | 
				
			||||||
  int split=2;
 | 
					  for(int bit=0;bit<32;bit++){
 | 
				
			||||||
  for(int mu=0;mu<ND;mu++){   // mu 0 takes bit 0; mu 1 takes bit 1 etc...
 | 
					 | 
				
			||||||
    for(int bit=0;bit<split;bit++){
 | 
					 | 
				
			||||||
      IndexInteger mask = one<<bit;
 | 
					 | 
				
			||||||
      if ( mask&(adims[mu]-1) ){
 | 
					 | 
				
			||||||
	bitlist[mu].push_back(sitebit);
 | 
					 | 
				
			||||||
	sitebit++;
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
  for(int bit=split;bit<32;bit++){
 | 
					 | 
				
			||||||
    IndexInteger mask = one<<bit;
 | 
					    IndexInteger mask = one<<bit;
 | 
				
			||||||
    for(int mu=0;mu<ND;mu++){   // mu 0 takes bit 0; mu 1 takes bit 1 etc...
 | 
					    for(int mu=0;mu<ND;mu++){   // mu 0 takes bit 0; mu 1 takes bit 1 etc...
 | 
				
			||||||
      if ( mask&(adims[mu]-1) ){
 | 
					      if ( mask&(adims[mu]-1) ){
 | 
				
			||||||
@@ -94,10 +153,22 @@ LebesgueOrder::LebesgueOrder(GridBase *grid)
 | 
				
			|||||||
	+        dims[0]*ax[1]
 | 
						+        dims[0]*ax[1]
 | 
				
			||||||
	+dims[0]*dims[1]*ax[2]
 | 
						+dims[0]*dims[1]*ax[2]
 | 
				
			||||||
	+dims[0]*dims[1]*dims[2]*ax[3];
 | 
						+dims[0]*dims[1]*dims[2]*ax[3];
 | 
				
			||||||
      
 | 
					
 | 
				
			||||||
 | 
					      assert(site < vol);
 | 
				
			||||||
      _LebesgueReorder.push_back(site);
 | 
					      _LebesgueReorder.push_back(site);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
	}
 | 
					  }
 | 
				
			||||||
  assert( _LebesgueReorder.size() == vol );
 | 
					  assert( _LebesgueReorder.size() == vol );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> coor(4);
 | 
				
			||||||
 | 
					  for(IndexInteger asite=0;asite<vol;asite++){
 | 
				
			||||||
 | 
					    grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
 | 
				
			||||||
 | 
					      std::cout << " site "<<asite << "->" << _LebesgueReorder[asite]<< " = ["
 | 
				
			||||||
 | 
							<< coor[0]<<","
 | 
				
			||||||
 | 
							<< coor[1]<<","
 | 
				
			||||||
 | 
							<< coor[2]<<","
 | 
				
			||||||
 | 
							<< coor[3]<<"]"
 | 
				
			||||||
 | 
							<<std::endl;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -9,17 +9,37 @@ namespace Grid {
 | 
				
			|||||||
  class LebesgueOrder { 
 | 
					  class LebesgueOrder { 
 | 
				
			||||||
  public:
 | 
					  public:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    typedef int32_t IndexInteger;
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    static int UseLebesgueOrder;
 | 
					    static int UseLebesgueOrder;
 | 
				
			||||||
 | 
					    GridBase *grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    typedef uint32_t IndexInteger;
 | 
					  public:
 | 
				
			||||||
 | 
					    LebesgueOrder(GridBase *_grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    inline IndexInteger Reorder(IndexInteger ss) { 
 | 
					    inline IndexInteger Reorder(IndexInteger ss) { 
 | 
				
			||||||
      return UseLebesgueOrder ? _LebesgueReorder[ss] : ss; 
 | 
					      return _LebesgueReorder[ss] ;
 | 
				
			||||||
    };
 | 
					    };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ////////////////////////////
 | 
				
			||||||
 | 
					    // Space filling fractal for cache oblivious
 | 
				
			||||||
 | 
					    ////////////////////////////
 | 
				
			||||||
 | 
					    void ZGraph(void);
 | 
				
			||||||
    IndexInteger alignup(IndexInteger n);
 | 
					    IndexInteger alignup(IndexInteger n);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    LebesgueOrder(GridBase *grid);
 | 
					    /////////////////////////////////
 | 
				
			||||||
 | 
					    // Cartesian stencil blocking strategy
 | 
				
			||||||
 | 
					    /////////////////////////////////
 | 
				
			||||||
 | 
					    static std::vector<int> Block;
 | 
				
			||||||
 | 
					    void CartesianBlocking(void);
 | 
				
			||||||
 | 
					    void IterateO(int ND,int dim,
 | 
				
			||||||
 | 
							  std::vector<IndexInteger> & xo,
 | 
				
			||||||
 | 
							  std::vector<IndexInteger> & xi,
 | 
				
			||||||
 | 
							  std::vector<IndexInteger> &dims);
 | 
				
			||||||
 | 
					    void IterateI(int ND,int dim,
 | 
				
			||||||
 | 
							  std::vector<IndexInteger> & xo,
 | 
				
			||||||
 | 
							  std::vector<IndexInteger> & xi,
 | 
				
			||||||
 | 
							  std::vector<IndexInteger> &dims);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  private:
 | 
					  private:
 | 
				
			||||||
    std::vector<IndexInteger> _LebesgueReorder;
 | 
					    std::vector<IndexInteger> _LebesgueReorder;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,268 +1,6 @@
 | 
				
			|||||||
#include "Grid.h"
 | 
					#include "Grid.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
namespace Grid {
 | 
					namespace Grid {
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  CartesianStencil::CartesianStencil(GridBase *grid,
 | 
					 | 
				
			||||||
				     int npoints,
 | 
					 | 
				
			||||||
				     int checkerboard,
 | 
					 | 
				
			||||||
				     const std::vector<int> &directions,
 | 
					 | 
				
			||||||
				     const std::vector<int> &distances) 
 | 
					 | 
				
			||||||
    :   _entries(npoints), _permute_type(npoints)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      _npoints = npoints;
 | 
					 | 
				
			||||||
      _grid    = grid;
 | 
					 | 
				
			||||||
      _directions = directions;
 | 
					 | 
				
			||||||
      _distances  = distances;
 | 
					 | 
				
			||||||
      _unified_buffer_size=0;
 | 
					 | 
				
			||||||
      _request_count =0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      int osites  = _grid->oSites();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      for(int i=0;i<npoints;i++){
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	int point = i;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	_entries[i].resize( osites);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	int dimension    = directions[i];
 | 
					 | 
				
			||||||
	int displacement = distances[i];
 | 
					 | 
				
			||||||
	int shift = displacement;
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	int fd = _grid->_fdimensions[dimension];
 | 
					 | 
				
			||||||
	int rd = _grid->_rdimensions[dimension];
 | 
					 | 
				
			||||||
	_permute_type[point]=_grid->PermuteType(dimension);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	_checkerboard = checkerboard;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	// the permute type
 | 
					 | 
				
			||||||
	int simd_layout     = _grid->_simd_layout[dimension];
 | 
					 | 
				
			||||||
	int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
					 | 
				
			||||||
	int splice_dim      = _grid->_simd_layout[dimension]>1 && (comm_dim);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	int sshift[2];
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	// Underlying approach. For each local site build
 | 
					 | 
				
			||||||
	// up a table containing the npoint "neighbours" and whether they 
 | 
					 | 
				
			||||||
	// live in lattice or a comms buffer.
 | 
					 | 
				
			||||||
	if ( !comm_dim ) {
 | 
					 | 
				
			||||||
	  sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
 | 
					 | 
				
			||||||
	  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	  if ( sshift[0] == sshift[1] ) {
 | 
					 | 
				
			||||||
	    Local(point,dimension,shift,0x3);
 | 
					 | 
				
			||||||
	  } else {
 | 
					 | 
				
			||||||
	    Local(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
					 | 
				
			||||||
	    Local(point,dimension,shift,0x2);// both with block stride loop iteration
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	} else { // All permute extract done in comms phase prior to Stencil application
 | 
					 | 
				
			||||||
	  //        So tables are the same whether comm_dim or splice_dim
 | 
					 | 
				
			||||||
	  sshift[0] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Even);
 | 
					 | 
				
			||||||
	  sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd);
 | 
					 | 
				
			||||||
	  if ( sshift[0] == sshift[1] ) {
 | 
					 | 
				
			||||||
	    Comms(point,dimension,shift,0x3);
 | 
					 | 
				
			||||||
	  } else {
 | 
					 | 
				
			||||||
	    Comms(point,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
 | 
					 | 
				
			||||||
	    Comms(point,dimension,shift,0x2);// both with block stride loop iteration
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    void CartesianStencil::Local     (int point, int dimension,int shiftpm,int cbmask)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      int fd = _grid->_fdimensions[dimension];
 | 
					 | 
				
			||||||
      int rd = _grid->_rdimensions[dimension];
 | 
					 | 
				
			||||||
      int ld = _grid->_ldimensions[dimension];
 | 
					 | 
				
			||||||
      int gd = _grid->_gdimensions[dimension];
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      // Map to always positive shift modulo global full dimension.
 | 
					 | 
				
			||||||
      int shift = (shiftpm+fd)%fd;
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      // the permute type
 | 
					 | 
				
			||||||
      int permute_dim =_grid->PermuteDim(dimension);
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      for(int x=0;x<rd;x++){       
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	int o   = 0;
 | 
					 | 
				
			||||||
	int bo  = x * _grid->_ostride[dimension];
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	int cb= (cbmask==0x2)? Odd : Even;
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	int sshift = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
 | 
					 | 
				
			||||||
	int sx     = (x+sshift)%rd;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	int wraparound=0;
 | 
					 | 
				
			||||||
	if ( (shiftpm==-1) && (sx>x)  ) {
 | 
					 | 
				
			||||||
	  wraparound = 1;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	if ( (shiftpm== 1) && (sx<x)  ) {
 | 
					 | 
				
			||||||
	  wraparound = 1;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	int permute_slice=0;
 | 
					 | 
				
			||||||
	if(permute_dim){
 | 
					 | 
				
			||||||
	  int wrap = sshift/rd;
 | 
					 | 
				
			||||||
	  int  num = sshift%rd;
 | 
					 | 
				
			||||||
	  if ( x< rd-num ) permute_slice=wrap;
 | 
					 | 
				
			||||||
	  else permute_slice = 1-wrap;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  	CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
 | 
					 | 
				
			||||||
  
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    void CartesianStencil::Comms     (int point,int dimension,int shiftpm,int cbmask)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      GridBase *grid=_grid;
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      int fd              = _grid->_fdimensions[dimension];
 | 
					 | 
				
			||||||
      int ld              = _grid->_ldimensions[dimension];
 | 
					 | 
				
			||||||
      int rd              = _grid->_rdimensions[dimension];
 | 
					 | 
				
			||||||
      int pd              = _grid->_processors[dimension];
 | 
					 | 
				
			||||||
      int simd_layout     = _grid->_simd_layout[dimension];
 | 
					 | 
				
			||||||
      int comm_dim        = _grid->_processors[dimension] >1 ;
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      //      assert(simd_layout==1); // Why?
 | 
					 | 
				
			||||||
      assert(comm_dim==1);
 | 
					 | 
				
			||||||
      int shift = (shiftpm + fd) %fd;
 | 
					 | 
				
			||||||
      assert(shift>=0);
 | 
					 | 
				
			||||||
      assert(shift<fd);
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
 | 
					 | 
				
			||||||
      _comm_buf_size[point] = buffer_size; // Size of _one_ plane. Multiple planes may be gathered and
 | 
					 | 
				
			||||||
                                           // send to one or more remote nodes.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      int cb= (cbmask==0x2)? Odd : Even;
 | 
					 | 
				
			||||||
      int sshift= _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb);
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      for(int x=0;x<rd;x++){       
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	int offnode = (((x+sshift)%fd) >= rd ); 
 | 
					 | 
				
			||||||
	//	int comm_proc   = ((x+sshift)/ld)%pd;        
 | 
					 | 
				
			||||||
	//	int offnode     = (comm_proc!=0);
 | 
					 | 
				
			||||||
	int sx          = (x+sshift)%rd;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	int wraparound=0;
 | 
					 | 
				
			||||||
	if ( (shiftpm==-1) && (sx>x) && (grid->_processor_coor[dimension]==0) ) {
 | 
					 | 
				
			||||||
	  wraparound = 1;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	if ( (shiftpm== 1) && (sx<x) && (grid->_processor_coor[dimension]==grid->_processors[dimension]-1) ) {
 | 
					 | 
				
			||||||
	  wraparound = 1;
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	if (!offnode) {
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	  int permute_slice=0;
 | 
					 | 
				
			||||||
	  CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound); 
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	} else {
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	  int words = buffer_size;
 | 
					 | 
				
			||||||
	  if (cbmask != 0x3) words=words>>1;
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	  //	  GatherPlaneSimple (point,dimension,sx,cbmask);
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	  int rank           = grid->_processor;
 | 
					 | 
				
			||||||
	  int recv_from_rank;
 | 
					 | 
				
			||||||
	  int xmit_to_rank;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	  int unified_buffer_offset = _unified_buffer_size;
 | 
					 | 
				
			||||||
	  _unified_buffer_size    += words;
 | 
					 | 
				
			||||||
	  ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset,wraparound); // permute/extract/merge is done in comms phase
 | 
					 | 
				
			||||||
	  
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  // Routine builds up integer table for each site in _offsets, _is_local, _permute
 | 
					 | 
				
			||||||
  void CartesianStencil::CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      int rd = _grid->_rdimensions[dimension];
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      if ( !_grid->CheckerBoarded(dimension) ) {
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	int o   = 0;                                     // relative offset to base within plane
 | 
					 | 
				
			||||||
	int ro  = rplane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
					 | 
				
			||||||
	int lo  = lplane*_grid->_ostride[dimension]; // offset in buffer
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	// Simple block stride gather of SIMD objects
 | 
					 | 
				
			||||||
	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 | 
					 | 
				
			||||||
	  for(int b=0;b<_grid->_slice_block[dimension];b++){
 | 
					 | 
				
			||||||
	    _entries[point][lo+o+b]._offset  =ro+o+b;
 | 
					 | 
				
			||||||
	    _entries[point][lo+o+b]._is_local=1;
 | 
					 | 
				
			||||||
	    _entries[point][lo+o+b]._permute=permute;
 | 
					 | 
				
			||||||
	    _entries[point][lo+o+b]._around_the_world=wrap;
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	  o +=_grid->_slice_stride[dimension];
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
      } else {
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	int ro  = rplane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
					 | 
				
			||||||
	int lo  = lplane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
					 | 
				
			||||||
	int o   = 0;                                     // relative offset to base within plane
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 | 
					 | 
				
			||||||
	  for(int b=0;b<_grid->_slice_block[dimension];b++){
 | 
					 | 
				
			||||||
	    
 | 
					 | 
				
			||||||
	    int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
 | 
					 | 
				
			||||||
	    
 | 
					 | 
				
			||||||
	    if ( ocb&cbmask ) {
 | 
					 | 
				
			||||||
	      _entries[point][lo+o+b]._offset =ro+o+b;
 | 
					 | 
				
			||||||
	      _entries[point][lo+o+b]._is_local=1;
 | 
					 | 
				
			||||||
	      _entries[point][lo+o+b]._permute=permute;
 | 
					 | 
				
			||||||
	      _entries[point][lo+o+b]._around_the_world=wrap;
 | 
					 | 
				
			||||||
	    }
 | 
					 | 
				
			||||||
	    
 | 
					 | 
				
			||||||
	    }
 | 
					 | 
				
			||||||
	  o +=_grid->_slice_stride[dimension];
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
  // Routine builds up integer table for each site in _offsets, _is_local, _permute
 | 
					 | 
				
			||||||
   void CartesianStencil::ScatterPlane (int point,int dimension,int plane,int cbmask,int offset, int wrap)
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
      int rd = _grid->_rdimensions[dimension];
 | 
					 | 
				
			||||||
      
 | 
					 | 
				
			||||||
      if ( !_grid->CheckerBoarded(dimension) ) {
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
					 | 
				
			||||||
	int o   = 0;                                    // relative offset to base within plane
 | 
					 | 
				
			||||||
	int bo  = 0;                                    // offset in buffer
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	// Simple block stride gather of SIMD objects
 | 
					 | 
				
			||||||
	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 | 
					 | 
				
			||||||
	  for(int b=0;b<_grid->_slice_block[dimension];b++){
 | 
					 | 
				
			||||||
	    _entries[point][so+o+b]._offset  =offset+(bo++);
 | 
					 | 
				
			||||||
	    _entries[point][so+o+b]._is_local=0;
 | 
					 | 
				
			||||||
	    _entries[point][so+o+b]._permute=0;
 | 
					 | 
				
			||||||
	    _entries[point][so+o+b]._around_the_world=wrap;
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	  o +=_grid->_slice_stride[dimension];
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
      } else { 
 | 
					 | 
				
			||||||
	
 | 
					 | 
				
			||||||
	int so  = plane*_grid->_ostride[dimension]; // base offset for start of plane 
 | 
					 | 
				
			||||||
	int o   = 0;                                      // relative offset to base within plane
 | 
					 | 
				
			||||||
	int bo  = 0;                                      // offset in buffer
 | 
					 | 
				
			||||||
    
 | 
					 | 
				
			||||||
	for(int n=0;n<_grid->_slice_nblock[dimension];n++){
 | 
					 | 
				
			||||||
	  for(int b=0;b<_grid->_slice_block[dimension];b++){
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	    int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
 | 
					 | 
				
			||||||
	    if ( ocb & cbmask ) {
 | 
					 | 
				
			||||||
	      _entries[point][so+o+b]._offset  =offset+(bo++);
 | 
					 | 
				
			||||||
	      _entries[point][so+o+b]._is_local=0;
 | 
					 | 
				
			||||||
	      _entries[point][so+o+b]._permute =0;
 | 
					 | 
				
			||||||
	      _entries[point][so+o+b]._around_the_world=wrap;
 | 
					 | 
				
			||||||
	    }
 | 
					 | 
				
			||||||
	  }
 | 
					 | 
				
			||||||
	  o +=_grid->_slice_stride[dimension];
 | 
					 | 
				
			||||||
	}
 | 
					 | 
				
			||||||
      }
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -115,23 +115,21 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
 | 
				
			|||||||
template<class vobj> inline 
 | 
					template<class vobj> inline 
 | 
				
			||||||
void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
 | 
					void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extracted, int offset)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
					 | 
				
			||||||
  typedef typename vobj::scalar_type scalar_type ;
 | 
					  typedef typename vobj::scalar_type scalar_type ;
 | 
				
			||||||
  typedef typename vobj::vector_type vector_type ;
 | 
					  typedef typename vobj::vector_type vector_type ;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
					  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
  const int Nsimd=vobj::vector_type::Nsimd();
 | 
					  const int Nsimd=vobj::vector_type::Nsimd();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int Nextr=extracted.size();
 | 
					  int Nextr=extracted.size();
 | 
				
			||||||
  int s = Nsimd/Nextr;
 | 
					  int s = Nsimd/Nextr;
 | 
				
			||||||
 | 
					  scalar_type * vp = (scalar_type *)&vec;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<scalar_type *> pointers(Nsimd);
 | 
					 | 
				
			||||||
  for(int i=0;i<Nextr;i++) {
 | 
					 | 
				
			||||||
    pointers[i] =(scalar_type *)& extracted[i][offset];
 | 
					 | 
				
			||||||
  }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  vector_type *vp = (vector_type *)&vec;
 | 
					 | 
				
			||||||
  for(int w=0;w<words;w++){
 | 
					  for(int w=0;w<words;w++){
 | 
				
			||||||
    extract<vector_type,scalar_type>(&vp[w],pointers,w);
 | 
					    for(int i=0;i<Nextr;i++){
 | 
				
			||||||
 | 
					      scalar_type * pointer = (scalar_type *)& extracted[i][offset];
 | 
				
			||||||
 | 
					      pointer[w] = vp[i*s+w*Nsimd];
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -173,16 +171,19 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
 | 
				
			|||||||
  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
					  const int words=sizeof(vobj)/sizeof(vector_type);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  int Nextr=extracted.size();
 | 
					  int Nextr=extracted.size();
 | 
				
			||||||
 | 
					  int s=Nsimd/Nextr;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<scalar_type *> pointers(Nextr);
 | 
					  scalar_type *pointer;
 | 
				
			||||||
  for(int i=0;i<Nextr;i++) 
 | 
					  scalar_type *vp = (scalar_type *)&vec;
 | 
				
			||||||
    pointers[i] =(scalar_type *)& extracted[i][offset];
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  vector_type *vp = (vector_type *)&vec;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  for(int w=0;w<words;w++){
 | 
					  for(int w=0;w<words;w++){
 | 
				
			||||||
    merge<vector_type,scalar_type>(&vp[w],pointers,w);
 | 
					    for(int i=0;i<Nextr;i++){
 | 
				
			||||||
 | 
					      for(int ii=0;ii<s;ii++){
 | 
				
			||||||
 | 
						pointer=(scalar_type *)&extracted[i][offset];
 | 
				
			||||||
 | 
						vp[w*Nsimd+i*s+ii] = pointer[w];
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
}
 | 
					 }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -86,16 +86,16 @@ Test_dwf_hdcr_SOURCES=Test_dwf_hdcr.cc
 | 
				
			|||||||
Test_dwf_hdcr_LDADD=-lGrid
 | 
					Test_dwf_hdcr_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 | 
					Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 | 
				
			||||||
#Test_dwf_lanczos_LDADD=-lGrid
 | 
					Test_dwf_lanczos_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Test_gamma_SOURCES=Test_gamma.cc
 | 
					Test_gamma_SOURCES=Test_gamma.cc
 | 
				
			||||||
Test_gamma_LDADD=-lGrid
 | 
					Test_gamma_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#Test_gparity_SOURCES=Test_gparity.cc
 | 
					Test_gparity_SOURCES=Test_gparity.cc
 | 
				
			||||||
#Test_gparity_LDADD=-lGrid
 | 
					Test_gparity_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 | 
					#Test_gpwilson_even_odd_SOURCES=Test_gpwilson_even_odd.cc
 | 
				
			||||||
@@ -190,6 +190,10 @@ Test_stencil_SOURCES=Test_stencil.cc
 | 
				
			|||||||
Test_stencil_LDADD=-lGrid
 | 
					Test_stencil_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Test_synthetic_lanczos_SOURCES=Test_synthetic_lanczos.cc
 | 
				
			||||||
 | 
					Test_synthetic_lanczos_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Test_wilson_cg_prec_SOURCES=Test_wilson_cg_prec.cc
 | 
					Test_wilson_cg_prec_SOURCES=Test_wilson_cg_prec.cc
 | 
				
			||||||
Test_wilson_cg_prec_LDADD=-lGrid
 | 
					Test_wilson_cg_prec_LDADD=-lGrid
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -54,27 +54,27 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  TComplex cm;
 | 
					  TComplex cm;
 | 
				
			||||||
  for(int dir=0;dir<Nd;dir++){
 | 
					  for(int dir=0;dir<Nd;dir++){
 | 
				
			||||||
    if ( dir!=1 ) continue;
 | 
					    //    if ( dir!=1 ) continue;
 | 
				
			||||||
    for(int shift=0;shift<latt_size[dir];shift++){
 | 
					    for(int shift=0;shift<latt_size[dir];shift++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
 | 
						std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	//	std::cout<<GridLogMessage<<"Even grid"<<std::endl;
 | 
						std::cout<<GridLogMessage<<"Even grid"<<std::endl;
 | 
				
			||||||
	ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
 | 
						ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
 | 
				
			||||||
	//	std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
 | 
						std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	//	std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
 | 
						std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
 | 
				
			||||||
	ShiftUo = Cshift(Uo,dir,shift);    
 | 
						ShiftUo = Cshift(Uo,dir,shift);    
 | 
				
			||||||
	//	std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
 | 
						std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	//	std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
 | 
						std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
 | 
				
			||||||
	setCheckerboard(rbShiftU,ShiftUe);
 | 
						setCheckerboard(rbShiftU,ShiftUe);
 | 
				
			||||||
	setCheckerboard(rbShiftU,ShiftUo);
 | 
						setCheckerboard(rbShiftU,ShiftUo);
 | 
				
			||||||
	//	std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
 | 
						std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	//	std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
 | 
						std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
 | 
				
			||||||
	ShiftU  = Cshift(U,dir,shift);    // Shift everything
 | 
						ShiftU  = Cshift(U,dir,shift);    // Shift everything
 | 
				
			||||||
	//	std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;
 | 
						std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::vector<int> coor(4);
 | 
						std::vector<int> coor(4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -105,18 +105,18 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  Fine.CoorFromIndex(peer,index,latt_size);
 | 
						  Fine.CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  if (nrm > 0){
 | 
						  if (nrm > 0){
 | 
				
			||||||
	    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
						    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
				
			||||||
		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
							     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
				
			||||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
							     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    index=real(scm);
 | 
						    index=real(scm);
 | 
				
			||||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
						    Fine.CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
	    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    exit(-1);
 | 
						    exit(-1);
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}}}}
 | 
						}}}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int exx=0;
 | 
				
			||||||
	std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
 | 
						std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
 | 
				
			||||||
	for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 | 
						for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
 | 
				
			||||||
	for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
 | 
						for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
 | 
				
			||||||
@@ -144,20 +144,21 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  Fine.CoorFromIndex(peer,index,latt_size);
 | 
						  Fine.CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  if (nrm > 0){
 | 
						  if (nrm > 0){
 | 
				
			||||||
	    std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
						    std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
 | 
				
			||||||
		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
							     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
				
			||||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
							     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
	    std::cerr<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Got    "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    index=real(scm);
 | 
						    index=real(scm);
 | 
				
			||||||
	    Fine.CoorFromIndex(peer,index,latt_size);
 | 
						    Fine.CoorFromIndex(peer,index,latt_size);
 | 
				
			||||||
	    std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
						    std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
 | 
				
			||||||
	    exit(-1);
 | 
						    exx=1;
 | 
				
			||||||
	  } else if (0) { 
 | 
						  } else if (1) { 
 | 
				
			||||||
	    std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
 | 
						    std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
 | 
				
			||||||
		     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
							     <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
 | 
				
			||||||
		     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
							     << cm()()()<<" expect "<<scm<<"  "<<nrm<<std::endl;
 | 
				
			||||||
	  }
 | 
						  }
 | 
				
			||||||
	}}}}
 | 
						}}}}
 | 
				
			||||||
 | 
						if (exx) exit(-1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
  }
 | 
					  }
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -35,21 +35,26 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
 | 
					  MdagMLinearOperator<DomainWallFermionR,LatticeFermion> HermOp(Ddwf);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  const int Nk = 10;
 | 
					  const int Nk = 30;
 | 
				
			||||||
  const int Np = 1;
 | 
					  const int Np = 10;
 | 
				
			||||||
  RealD enorm  = 1.0;
 | 
					  const int Nm = Nk+Np;
 | 
				
			||||||
  RealD vthrs  = 1;
 | 
					  const int MaxIt= 10000;
 | 
				
			||||||
  const int Nit= 1000;
 | 
					  RealD resid = 1.0e-8;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,
 | 
					  std::vector<double> Coeffs(1,1.0);
 | 
				
			||||||
						 Nk,Np,enorm,vthrs,Nit);
 | 
					  Polynomial<LatticeFermion> PolyX(Coeffs);
 | 
				
			||||||
 | 
					  ImplicitlyRestartedLanczos<LatticeFermion> IRL(HermOp,PolyX,Nk,Nm,resid,MaxIt);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  std::vector<RealD>          eval(Nk);
 | 
					  std::vector<RealD>          eval(Nm);
 | 
				
			||||||
  std::vector<LatticeFermion> evec(Nk,FGrid);
 | 
					  std::vector<LatticeFermion> evec(Nm,FGrid);
 | 
				
			||||||
 | 
					  for(int i=0;i<Nm;i++){
 | 
				
			||||||
 | 
					    std::cout << i<<" / "<< Nm<< " grid pointer "<<evec[i]._grid<<std::endl;
 | 
				
			||||||
 | 
					  };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int Nconv;
 | 
				
			||||||
  IRL.calc(eval,evec,
 | 
					  IRL.calc(eval,evec,
 | 
				
			||||||
	   src,
 | 
						   src,
 | 
				
			||||||
	   Nsbt,
 | 
					 | 
				
			||||||
	   Nconv);
 | 
						   Nconv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -298,7 +298,7 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
      c          = scm()(1,1)(1,2);
 | 
					      c          = scm()(1,1)(1,2);
 | 
				
			||||||
      scm()(1,1)(2,1) = c;
 | 
					      scm()(1,1)(2,1) = c;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      pokeIndex<ColourIndex> (c_m,c,0,0);
 | 
					      //      pokeIndex<ColourIndex> (c_m,c,0,0);
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    FooBar = Bar;
 | 
					    FooBar = Bar;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -8,6 +8,10 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
  Grid_init(&argc,&argv);
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  //  typedef LatticeColourMatrix Field;
 | 
				
			||||||
 | 
					  typedef LatticeComplex Field;
 | 
				
			||||||
 | 
					  typedef typename Field::vector_object vobj;
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_object sobj;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  std::vector<int> latt_size   = GridDefaultLatt();
 | 
					  std::vector<int> latt_size   = GridDefaultLatt();
 | 
				
			||||||
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
 | 
					  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
 | 
				
			||||||
@@ -18,23 +22,40 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
  GridCartesian Fine(latt_size,simd_layout,mpi_layout);
 | 
					  GridCartesian Fine(latt_size,simd_layout,mpi_layout);
 | 
				
			||||||
  GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
 | 
					  GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
 | 
				
			||||||
  GridParallelRNG       fRNG(&Fine);
 | 
					  GridParallelRNG       fRNG(&Fine);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  //  fRNG.SeedRandomDevice();
 | 
					  //  fRNG.SeedRandomDevice();
 | 
				
			||||||
  std::vector<int> seeds({1,2,3,4});
 | 
					  std::vector<int> seeds({1,2,3,4});
 | 
				
			||||||
  fRNG.SeedFixedIntegers(seeds);
 | 
					  fRNG.SeedFixedIntegers(seeds);
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
  LatticeColourMatrix Foo(&Fine);
 | 
					  Field Foo(&Fine);
 | 
				
			||||||
  LatticeColourMatrix Bar(&Fine);
 | 
					  Field Bar(&Fine);
 | 
				
			||||||
  LatticeColourMatrix Check(&Fine);
 | 
					  Field Check(&Fine);
 | 
				
			||||||
  LatticeColourMatrix Diff(&Fine);
 | 
					  Field Diff(&Fine);
 | 
				
			||||||
  
 | 
					  LatticeComplex lex(&Fine);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  lex = zero;  
 | 
				
			||||||
  random(fRNG,Foo);
 | 
					  random(fRNG,Foo);
 | 
				
			||||||
  gaussian(fRNG,Bar);
 | 
					  gaussian(fRNG,Bar);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  /*
 | 
				
			||||||
 | 
					  Integer stride =1000;
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    double nrm;
 | 
				
			||||||
 | 
					    LatticeComplex coor(&Fine);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int d=0;d<Nd;d++){
 | 
				
			||||||
 | 
					      LatticeCoordinate(coor,d);
 | 
				
			||||||
 | 
					      lex = lex + coor*stride;
 | 
				
			||||||
 | 
					      stride=stride/10;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    Foo=lex;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for(int dir=0;dir<4;dir++){
 | 
					    for(int dir=0;dir<4;dir++){
 | 
				
			||||||
      for(int disp=0;disp<Fine._fdimensions[dir];disp++){
 | 
					      for(int disp=0;disp<Fine._fdimensions[dir];disp++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::cout<<GridLogMessage << "Using stencil to shift dim "<<dir<< " by "<<disp<<std::endl;
 | 
						std::cout<< std::fixed <<GridLogMessage << "Using stencil to shift dim "<<dir<< " by "<<disp<<std::endl;
 | 
				
			||||||
	// start to test the Cartesian npoint stencil infrastructure
 | 
						// start to test the Cartesian npoint stencil infrastructure
 | 
				
			||||||
	int npoint=1;
 | 
						int npoint=1;
 | 
				
			||||||
	std::vector<int> directions(npoint,dir);
 | 
						std::vector<int> directions(npoint,dir);
 | 
				
			||||||
@@ -48,8 +69,8 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 | 
						  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	
 | 
						
 | 
				
			||||||
	std::vector<vColourMatrix,alignedAllocator<vColourMatrix> >  comm_buf(myStencil._unified_buffer_size);
 | 
						std::vector<vobj,alignedAllocator<vobj> >  comm_buf(myStencil._unified_buffer_size);
 | 
				
			||||||
	SimpleCompressor<vColourMatrix> compress;
 | 
						SimpleCompressor<vobj> compress;
 | 
				
			||||||
	myStencil.HaloExchange(Foo,comm_buf,compress);
 | 
						myStencil.HaloExchange(Foo,comm_buf,compress);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	Bar = Cshift(Foo,dir,disp);
 | 
						Bar = Cshift(Foo,dir,disp);
 | 
				
			||||||
@@ -75,9 +96,114 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	Real nrm  = norm2(Diff);
 | 
						Real nrm  = norm2(Diff);
 | 
				
			||||||
	std::cout<<GridLogMessage<<"N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;
 | 
						std::cout<<GridLogMessage<<"N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	Real snrmC =0;
 | 
						std::vector<int> coor(4);
 | 
				
			||||||
	Real snrmB =0;
 | 
						for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
 | 
				
			||||||
	Real snrm  =0;
 | 
						for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
 | 
				
			||||||
 | 
						for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
 | 
				
			||||||
 | 
						for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  RealD diff;
 | 
				
			||||||
 | 
						  sobj check,bar;
 | 
				
			||||||
 | 
						  peekSite(check,Check,coor);
 | 
				
			||||||
 | 
						  peekSite(bar,Bar,coor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  sobj ddiff;
 | 
				
			||||||
 | 
						  ddiff = check -bar;
 | 
				
			||||||
 | 
						  diff =norm2(ddiff);
 | 
				
			||||||
 | 
						  if ( diff > 0){
 | 
				
			||||||
 | 
						    std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]
 | 
				
			||||||
 | 
							      <<") " <<check<<" vs "<<bar<<std::endl;
 | 
				
			||||||
 | 
						  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						 
 | 
				
			||||||
 | 
						}}}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    std::cout<<GridLogMessage<<"Testing RedBlack\n ";
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Field EFoo(&rbFine);
 | 
				
			||||||
 | 
					  Field OFoo(&rbFine);
 | 
				
			||||||
 | 
					  Field ECheck(&rbFine);
 | 
				
			||||||
 | 
					  Field OCheck(&rbFine);
 | 
				
			||||||
 | 
					  pickCheckerboard(Even,EFoo,Foo);
 | 
				
			||||||
 | 
					  pickCheckerboard(Odd ,OFoo,Foo);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int dir=0;dir<4;dir++){
 | 
				
			||||||
 | 
					      for(int disp=0;disp<rbFine._fdimensions[dir];disp++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage << "Using stencil to shift rb dim "<<dir<< " by "<<disp<<std::endl;
 | 
				
			||||||
 | 
						// start to test the Cartesian npoint stencil infrastructure
 | 
				
			||||||
 | 
						int npoint=1;
 | 
				
			||||||
 | 
						std::vector<int> directions(npoint,dir);
 | 
				
			||||||
 | 
						std::vector<int> displacements(npoint,disp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						CartesianStencil EStencil(&rbFine,npoint,Even,directions,displacements);
 | 
				
			||||||
 | 
						CartesianStencil OStencil(&rbFine,npoint,Odd,directions,displacements);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						std::vector<int> ocoor(4);
 | 
				
			||||||
 | 
						for(int o=0;o<Fine.oSites();o++){
 | 
				
			||||||
 | 
						  Fine.oCoorFromOindex(ocoor,o);
 | 
				
			||||||
 | 
						  ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						std::vector<vobj,alignedAllocator<vobj> >  Ecomm_buf(EStencil._unified_buffer_size);
 | 
				
			||||||
 | 
						std::vector<vobj,alignedAllocator<vobj> >  Ocomm_buf(OStencil._unified_buffer_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						SimpleCompressor<vobj> compress;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						EStencil.HaloExchange(EFoo,Ecomm_buf,compress);
 | 
				
			||||||
 | 
						OStencil.HaloExchange(OFoo,Ocomm_buf,compress);
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						Bar = Cshift(Foo,dir,disp);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if ( disp & 0x1 ) {
 | 
				
			||||||
 | 
						  ECheck.checkerboard = Even;
 | 
				
			||||||
 | 
						  OCheck.checkerboard = Odd;
 | 
				
			||||||
 | 
						} else { 
 | 
				
			||||||
 | 
						  ECheck.checkerboard = Odd;
 | 
				
			||||||
 | 
						  OCheck.checkerboard = Even;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						// Implement a stencil code that should agree with that darn cshift!
 | 
				
			||||||
 | 
						for(int i=0;i<OCheck._grid->oSites();i++){
 | 
				
			||||||
 | 
						  int permute_type;
 | 
				
			||||||
 | 
						  StencilEntry *SE;
 | 
				
			||||||
 | 
						  SE = EStencil.GetEntry(permute_type,0,i);
 | 
				
			||||||
 | 
						  std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						  if ( SE->_is_local && SE->_permute )
 | 
				
			||||||
 | 
						    permute(OCheck._odata[i],EFoo._odata[SE->_offset],permute_type);
 | 
				
			||||||
 | 
						  else if (SE->_is_local)
 | 
				
			||||||
 | 
						    OCheck._odata[i] = EFoo._odata[SE->_offset];
 | 
				
			||||||
 | 
						  else 
 | 
				
			||||||
 | 
						    OCheck._odata[i] = Ecomm_buf[SE->_offset];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						for(int i=0;i<ECheck._grid->oSites();i++){
 | 
				
			||||||
 | 
						  int permute_type;
 | 
				
			||||||
 | 
						  StencilEntry *SE;
 | 
				
			||||||
 | 
						  SE = OStencil.GetEntry(permute_type,0,i);
 | 
				
			||||||
 | 
						  std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
 | 
				
			||||||
 | 
						  
 | 
				
			||||||
 | 
						  if ( SE->_is_local && SE->_permute )
 | 
				
			||||||
 | 
						    permute(ECheck._odata[i],OFoo._odata[SE->_offset],permute_type);
 | 
				
			||||||
 | 
						  else if (SE->_is_local)
 | 
				
			||||||
 | 
						    ECheck._odata[i] = OFoo._odata[SE->_offset];
 | 
				
			||||||
 | 
						  else 
 | 
				
			||||||
 | 
						    ECheck._odata[i] = Ocomm_buf[SE->_offset];
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						setCheckerboard(Check,ECheck);
 | 
				
			||||||
 | 
						setCheckerboard(Check,OCheck);
 | 
				
			||||||
 | 
						
 | 
				
			||||||
 | 
						Real nrmC = norm2(Check);
 | 
				
			||||||
 | 
						Real nrmB = norm2(Bar);
 | 
				
			||||||
 | 
						Diff = Check-Bar;
 | 
				
			||||||
 | 
						Real nrm  = norm2(Diff);
 | 
				
			||||||
 | 
						std::cout<<GridLogMessage<<"RB N2diff ="<<nrm<<" "<<nrmC<<" " <<nrmB<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::vector<int> coor(4);
 | 
						std::vector<int> coor(4);
 | 
				
			||||||
	for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
 | 
						for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
 | 
				
			||||||
@@ -85,33 +211,22 @@ int main (int argc, char ** argv)
 | 
				
			|||||||
	for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
 | 
						for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
 | 
				
			||||||
	for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
 | 
						for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  Complex diff;
 | 
						  RealD diff;
 | 
				
			||||||
	  ColourMatrix check,bar;
 | 
						  sobj check,bar;
 | 
				
			||||||
	  peekSite(check,Check,coor);
 | 
						  peekSite(check,Check,coor);
 | 
				
			||||||
	  peekSite(bar,Bar,coor);
 | 
						  peekSite(bar,Bar,coor);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	  for(int r=0;r<3;r++){
 | 
						  sobj ddiff;
 | 
				
			||||||
	  for(int c=0;c<3;c++){
 | 
						  ddiff = check -bar;
 | 
				
			||||||
            diff =check()()(r,c)-bar()()(r,c);
 | 
						  diff =norm2(ddiff);
 | 
				
			||||||
            double nn=real(conjugate(diff)*diff);
 | 
						  if ( diff > 0){
 | 
				
			||||||
            if ( nn > 0){
 | 
						    std::cout <<"Coor (" << coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3] <<") "
 | 
				
			||||||
	      printf("Coor (%d %d %d %d) \t rc %d%d \t %le (%le,%le) %le\n",
 | 
							      <<"shift "<<disp<<" dir "<< dir 
 | 
				
			||||||
		     coor[0],coor[1],coor[2],coor[3],r,c,
 | 
							      << "  stencil impl " <<check<<" vs cshift impl "<<bar<<std::endl;
 | 
				
			||||||
		     nn,
 | 
						  }
 | 
				
			||||||
		     real(check()()(r,c)),
 | 
					 | 
				
			||||||
		     imag(check()()(r,c)),
 | 
					 | 
				
			||||||
		     real(bar()()(r,c))
 | 
					 | 
				
			||||||
		     );
 | 
					 | 
				
			||||||
	    }
 | 
					 | 
				
			||||||
	    snrmC=snrmC+real(conjugate(check()()(r,c))*check()()(r,c));
 | 
					 | 
				
			||||||
	    snrmB=snrmB+real(conjugate(bar()()(r,c))*bar()()(r,c));
 | 
					 | 
				
			||||||
	    snrm=snrm+nn;
 | 
					 | 
				
			||||||
	  }}
 | 
					 | 
				
			||||||
	 
 | 
						 
 | 
				
			||||||
	}}}}
 | 
						}}}}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::cout<<GridLogMessage<<"scalar N2diff = "<<snrm<<" " <<snrmC<<" "<<snrmB<<std::endl;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
      }
 | 
					      }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										123
									
								
								tests/Test_synthetic_lanczos.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										123
									
								
								tests/Test_synthetic_lanczos.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,123 @@
 | 
				
			|||||||
 | 
					#include <fenv.h>
 | 
				
			||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace std;
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					static int
 | 
				
			||||||
 | 
					FEenableexcept (unsigned int excepts)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  static fenv_t fenv;
 | 
				
			||||||
 | 
					  unsigned int new_excepts = excepts & FE_ALL_EXCEPT,
 | 
				
			||||||
 | 
					    old_excepts;  // previous masks
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  if ( fegetenv (&fenv) ) return -1;
 | 
				
			||||||
 | 
					  old_excepts = fenv.__control & FE_ALL_EXCEPT;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // unmask
 | 
				
			||||||
 | 
					  fenv.__control &= ~new_excepts;
 | 
				
			||||||
 | 
					  fenv.__mxcsr   &= ~(new_excepts << 7);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  return ( fesetenv (&fenv) ? -1 : old_excepts );
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					template<class Field> class DumbOperator  : public LinearOperatorBase<Field> {
 | 
				
			||||||
 | 
					public:
 | 
				
			||||||
 | 
					  LatticeComplex scale;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  DumbOperator(GridBase *grid)    : scale(grid)
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    GridParallelRNG  pRNG(grid);  
 | 
				
			||||||
 | 
					    std::vector<int> seeds({5,6,7,8});
 | 
				
			||||||
 | 
					    pRNG.SeedFixedIntegers(seeds);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    random(pRNG,scale);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    scale = exp(-real(scale)*6.0);
 | 
				
			||||||
 | 
					    std::cout << " True matrix \n"<< scale <<std::endl;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Support for coarsening to a multigrid
 | 
				
			||||||
 | 
					  void OpDiag (const Field &in, Field &out) {};
 | 
				
			||||||
 | 
					  void OpDir  (const Field &in, Field &out,int dir,int disp){};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  void Op     (const Field &in, Field &out){
 | 
				
			||||||
 | 
					    out = scale * in;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  void AdjOp  (const Field &in, Field &out){
 | 
				
			||||||
 | 
					    out = scale * in;
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  void HermOp(const Field &in, Field &out){
 | 
				
			||||||
 | 
					    double n1, n2;
 | 
				
			||||||
 | 
					    HermOpAndNorm(in,out,n1,n2);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  void HermOpAndNorm(const Field &in, Field &out,double &n1,double &n2){
 | 
				
			||||||
 | 
					    ComplexD dot;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    out = scale * in;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    dot= innerProduct(in,out);
 | 
				
			||||||
 | 
					    n1=real(dot);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    dot = innerProduct(out,out);
 | 
				
			||||||
 | 
					    n2=real(dot);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  FEenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridCartesian *grid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), 
 | 
				
			||||||
 | 
											       GridDefaultSimd(Nd,vComplex::Nsimd()),
 | 
				
			||||||
 | 
											       GridDefaultMpi());
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridParallelRNG  RNG(grid);  
 | 
				
			||||||
 | 
					  std::vector<int> seeds({1,2,3,4});
 | 
				
			||||||
 | 
					  RNG.SeedFixedIntegers(seeds);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD alpha = 1.0;
 | 
				
			||||||
 | 
					  RealD beta  = 0.03;
 | 
				
			||||||
 | 
					  RealD mu    = 0.0;
 | 
				
			||||||
 | 
					  int order = 11;
 | 
				
			||||||
 | 
					  ChebyshevLanczos<LatticeComplex> Cheby(alpha,beta,mu,order);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::ofstream file("pooh.dat");
 | 
				
			||||||
 | 
					  Cheby.csv(file);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  HermOpOperatorFunction<LatticeComplex> X;
 | 
				
			||||||
 | 
					  DumbOperator<LatticeComplex> HermOp(grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  const int Nk = 40;
 | 
				
			||||||
 | 
					  const int Nm = 80;
 | 
				
			||||||
 | 
					  const int Nit= 10000;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int Nconv;
 | 
				
			||||||
 | 
					  RealD eresid = 1.0e-8;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ImplicitlyRestartedLanczos<LatticeComplex> IRL(HermOp,X,Nk,Nm,eresid,Nit);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ImplicitlyRestartedLanczos<LatticeComplex> ChebyIRL(HermOp,Cheby,Nk,Nm,eresid,Nit);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeComplex src(grid); gaussian(RNG,src);
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    std::vector<RealD>          eval(Nm);
 | 
				
			||||||
 | 
					    std::vector<LatticeComplex> evec(Nm,grid);
 | 
				
			||||||
 | 
					    IRL.calc(eval,evec,src, Nconv);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    std::vector<RealD>          eval(Nm);
 | 
				
			||||||
 | 
					    std::vector<LatticeComplex> evec(Nm,grid);
 | 
				
			||||||
 | 
					    ChebyIRL.calc(eval,evec,src, Nconv);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  Grid_finalize();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
							
								
								
									
										293
									
								
								tests/Test_zmm.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										293
									
								
								tests/Test_zmm.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,293 @@
 | 
				
			|||||||
 | 
					#include <Grid.h>
 | 
				
			||||||
 | 
					#include <PerfCount.h>
 | 
				
			||||||
 | 
					#include <simd/Avx512Asm.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					using namespace Grid::QCD;
 | 
				
			||||||
 | 
					void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3);
 | 
				
			||||||
 | 
					void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3);
 | 
				
			||||||
 | 
					void TimesIAvx512F(void *ptr1,void *ptr3);
 | 
				
			||||||
 | 
					void TimesIAvx512(void *ptr1,void *ptr3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main(int argc,char **argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  std::vector<int> latt4 = GridDefaultLatt();
 | 
				
			||||||
 | 
					  const int Ls=16;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
				
			||||||
 | 
					  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
 | 
				
			||||||
 | 
					  std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
				
			||||||
 | 
					  int threads = GridThread::GetThreads();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<int> seeds4({1,2,3,4});
 | 
				
			||||||
 | 
					  std::vector<int> seeds5({5,6,7,8});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  vColourMatrixD mat;
 | 
				
			||||||
 | 
					  vHalfSpinColourVectorD vec;
 | 
				
			||||||
 | 
					  vHalfSpinColourVectorD matvec;
 | 
				
			||||||
 | 
					  vHalfSpinColourVectorD ref;
 | 
				
			||||||
 | 
					  vComplexD err;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  random(sRNG,mat);
 | 
				
			||||||
 | 
					  random(sRNG,vec);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ref = mat*vec;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  WilsonDslashAvx512((void *)&vec, (void *)&mat,(void *)&matvec);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  ref = ref - matvec;
 | 
				
			||||||
 | 
					  err = TensorRemove(innerProduct(ref,ref));
 | 
				
			||||||
 | 
					  std::cout <<"Double SU3 x 2spin diff   "<< Reduce(err)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  vColourMatrixF matF;
 | 
				
			||||||
 | 
					  vHalfSpinColourVectorF vecF;
 | 
				
			||||||
 | 
					  vHalfSpinColourVectorF matvecF;
 | 
				
			||||||
 | 
					  vHalfSpinColourVectorF refF;
 | 
				
			||||||
 | 
					  vComplexF errF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  random(sRNG,matF);
 | 
				
			||||||
 | 
					  random(sRNG,vecF);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  refF = matF*vecF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  WilsonDslashAvx512F((void *)&vecF, (void *)&matF,(void *)&matvecF);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  refF = refF-matvecF;
 | 
				
			||||||
 | 
					  errF = TensorRemove(innerProduct(refF,refF));
 | 
				
			||||||
 | 
					  std::cout <<"Single SU3 x 2spin diff   "<< Reduce(errF)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  TimesIAvx512F((void *)&vecF,(void *)&matvecF);
 | 
				
			||||||
 | 
					  refF = timesI(vecF)-matvecF;
 | 
				
			||||||
 | 
					  errF = TensorRemove(innerProduct(refF,refF));
 | 
				
			||||||
 | 
					  std::cout <<" timesI single diff  "<< Reduce(errF)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  TimesIAvx512((void *)&vec,(void *)&matvec);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  ref = timesI(vec)-matvec;
 | 
				
			||||||
 | 
					  err = TensorRemove(innerProduct(ref,ref));
 | 
				
			||||||
 | 
					  std::cout <<" timesI double diff  "<< Reduce(err)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeFermion src (FGrid);
 | 
				
			||||||
 | 
					  LatticeFermion srce(FrbGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeFermion resulto(FrbGrid); resulto=zero;
 | 
				
			||||||
 | 
					  LatticeFermion resulta(FrbGrid); resulta=zero;
 | 
				
			||||||
 | 
					  LatticeFermion diff(FrbGrid); 
 | 
				
			||||||
 | 
					  LatticeGaugeField Umu(UGrid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if 1
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
				
			||||||
 | 
					  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
				
			||||||
 | 
					  random(RNG5,src);
 | 
				
			||||||
 | 
					  random(RNG4,Umu);
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
					  int mmu=3;
 | 
				
			||||||
 | 
					  std::vector<LatticeColourMatrix> U(4,UGrid);
 | 
				
			||||||
 | 
					  for(int mu=0;mu<Nd;mu++){
 | 
				
			||||||
 | 
					    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
 | 
				
			||||||
 | 
					    if ( mu!=mmu ) U[mu] = zero;
 | 
				
			||||||
 | 
					    if ( mu==mmu ) U[mu] = 1.0;
 | 
				
			||||||
 | 
					    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					 pickCheckerboard(Even,srce,src);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  RealD mass=0.1;
 | 
				
			||||||
 | 
					  RealD M5  =1.8;
 | 
				
			||||||
 | 
					  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
 | 
				
			||||||
 | 
					  int ncall=50;
 | 
				
			||||||
 | 
					  double t0=usecond();
 | 
				
			||||||
 | 
					  for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					    Dw.DhopOE(srce,resulto,0);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  double t1=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 | 
				
			||||||
 | 
					  double flops=1344*volume/2;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Called Dw"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "norm result "<< norm2(resulto)<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "mflop/s =   "<< flops*ncall/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  QCD::WilsonFermion5DStatic::AsmOptDslash=1;
 | 
				
			||||||
 | 
					  t0=usecond();
 | 
				
			||||||
 | 
					  for(int i=0;i<ncall;i++){
 | 
				
			||||||
 | 
					    Dw.DhopOE(srce,resulta,0);
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  t1=usecond();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
 | 
				
			||||||
 | 
					    Dw.DhopOE(srce,resulta,0);
 | 
				
			||||||
 | 
					    PerformanceCounter Counter(i);
 | 
				
			||||||
 | 
					    Counter.Start();
 | 
				
			||||||
 | 
					    Dw.DhopOE(srce,resulta,0);
 | 
				
			||||||
 | 
					    Counter.Stop();
 | 
				
			||||||
 | 
					    Counter.Report();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  resulta = (-0.5) * resulta;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "Called Asm Dw"<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "norm result "<< norm2(resulta)<<std::endl;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "mflop/s =   "<< flops*ncall/(t1-t0)<<std::endl;
 | 
				
			||||||
 | 
					  diff = resulto-resulta;
 | 
				
			||||||
 | 
					  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VLOAD
 | 
				
			||||||
 | 
					#undef VSTORE
 | 
				
			||||||
 | 
					#undef VMUL
 | 
				
			||||||
 | 
					#undef VMADD
 | 
				
			||||||
 | 
					#undef ZEND1
 | 
				
			||||||
 | 
					#undef ZEND2
 | 
				
			||||||
 | 
					#undef ZLOAD
 | 
				
			||||||
 | 
					#undef ZMUL
 | 
				
			||||||
 | 
					#undef ZMADD
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VZERO(A) VZEROd(A)
 | 
				
			||||||
 | 
					#define VTIMESI(A,B,C) VTIMESId(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI(A,B,C) VTIMESMINUSId(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VLOAD(OFF,PTR,DEST)       VLOADd(OFF,PTR,DEST)
 | 
				
			||||||
 | 
					#define VSTORE(OFF,PTR,SRC)       VSTOREd(OFF,PTR,SRC)
 | 
				
			||||||
 | 
					#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULd(Uri,Uir,Chi,UChi,Z)
 | 
				
			||||||
 | 
					#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDd(Uri,Uir,Chi,UChi,Z)
 | 
				
			||||||
 | 
					#define ZEND1(A,B,C)               ZEND1d(A,B,C)
 | 
				
			||||||
 | 
					#define ZEND2(A,B,C)               ZEND2d(A,B,C)
 | 
				
			||||||
 | 
					#define ZLOAD(A,B,C,D)            ZLOADd(A,B,C,D)
 | 
				
			||||||
 | 
					#define ZMUL(A,B,C,D,E)           ZMULd(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMADD(A,B,C,D,E)          ZMADDd(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define zz Z0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void TimesIAvx512(void *ptr1,void *ptr3)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  __asm__ ("mov     $0xAAAA, %%eax "  : : :"%eax");
 | 
				
			||||||
 | 
					  __asm__ ("kmov    %%eax, %%k6 " : : :);
 | 
				
			||||||
 | 
					  __asm__ ("knot     %%k6, %%k7 " : : :);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  LOAD_CHI(ptr1);
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  __asm__ (
 | 
				
			||||||
 | 
					  VZERO(zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_00,UChi_00,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_01,UChi_01,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_02,UChi_02,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_10,UChi_10,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_11,UChi_11,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_12,UChi_12,zz)
 | 
				
			||||||
 | 
					  );
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  SAVE_UCHI(ptr3);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void WilsonDslashAvx512(void *ptr1,void *ptr2,void *ptr3)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  int return_address;
 | 
				
			||||||
 | 
					  // prototype computed goto to eliminate ABI save restore on call/return in
 | 
				
			||||||
 | 
					  // generated assembly.
 | 
				
			||||||
 | 
					  static void * table[] = { &&save, &&mult };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LOAD_CHI(ptr1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  return_address = 0;
 | 
				
			||||||
 | 
					  goto mult;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 save:
 | 
				
			||||||
 | 
					  SAVE_UCHI(ptr3);
 | 
				
			||||||
 | 
					  return;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 mult:
 | 
				
			||||||
 | 
					  MULT_2SPIN(ptr2);
 | 
				
			||||||
 | 
					  goto *table[return_address];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#undef VLOAD
 | 
				
			||||||
 | 
					#undef VSTORE
 | 
				
			||||||
 | 
					#undef VMUL
 | 
				
			||||||
 | 
					#undef VMADD
 | 
				
			||||||
 | 
					#undef ZEND1
 | 
				
			||||||
 | 
					#undef ZEND2
 | 
				
			||||||
 | 
					#undef ZLOAD
 | 
				
			||||||
 | 
					#undef ZMUL
 | 
				
			||||||
 | 
					#undef ZMADD
 | 
				
			||||||
 | 
					#undef VZERO
 | 
				
			||||||
 | 
					#undef VTIMESI
 | 
				
			||||||
 | 
					#undef VTIMESI0
 | 
				
			||||||
 | 
					#undef VTIMESI1
 | 
				
			||||||
 | 
					#undef VTIMESI2
 | 
				
			||||||
 | 
					#undef VTIMESMINUSI
 | 
				
			||||||
 | 
					#undef ZMULMEM2SP
 | 
				
			||||||
 | 
					#undef ZMADDMEM2SP
 | 
				
			||||||
 | 
					#define VZERO(A) VZEROf(A)
 | 
				
			||||||
 | 
					#define VMOV(A,B) VMOVf(A,B)
 | 
				
			||||||
 | 
					#define VADD(A,B,C) VADDf(A,B,C)
 | 
				
			||||||
 | 
					#define VSUB(A,B,C) VSUBf(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
 | 
				
			||||||
 | 
					#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define VLOAD(OFF,PTR,DEST)       VLOADf(OFF,PTR,DEST)
 | 
				
			||||||
 | 
					#define VSTORE(OFF,PTR,SRC)       VSTOREf(OFF,PTR,SRC)
 | 
				
			||||||
 | 
					#define VMUL(Uri,Uir,Chi,UChi,Z)  VMULf(Uri,Uir,Chi,UChi,Z)
 | 
				
			||||||
 | 
					#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
 | 
				
			||||||
 | 
					#define ZEND1(A,B,C)               ZEND1f(A,B,C)
 | 
				
			||||||
 | 
					#define ZEND2(A,B,C)               ZEND2f(A,B,C)
 | 
				
			||||||
 | 
					#define ZLOAD(A,B,C,D)            ZLOADf(A,B,C,D)
 | 
				
			||||||
 | 
					#define ZMUL(A,B,C,D,E)           ZMULf(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMADD(A,B,C,D,E)          ZMADDf(A,B,C,D,E)
 | 
				
			||||||
 | 
					#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void TimesIAvx512F(void *ptr1,void *ptr3)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  LOAD_CHI(ptr1);
 | 
				
			||||||
 | 
					  __asm__ (
 | 
				
			||||||
 | 
					  VZERO(zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_00,UChi_00,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_01,UChi_01,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_02,UChi_02,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_10,UChi_10,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_11,UChi_11,zz)
 | 
				
			||||||
 | 
					  VTIMESI(Chi_12,UChi_12,zz)
 | 
				
			||||||
 | 
						   );
 | 
				
			||||||
 | 
					  SAVE_UCHI(ptr3);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  MASK_REGS;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LOAD_CHI(ptr1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  MULT_2SPIN(ptr2);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  SAVE_UCHI(ptr3);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  return;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		Reference in New Issue
	
	Block a user