mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-22 16:54:48 +01:00 
			
		
		
		
	Compare commits
	
		
			45 Commits
		
	
	
		
			feature/hw
			...
			DIRAC-ITT-
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 12e239dd9f | ||
|  | af2301afbb | ||
|  | f98856a26f | ||
|  | d55cc5b380 | ||
| c2b688abc9 | |||
| b0d61b9687 | |||
| 5f893bf9af | |||
| 0e17bd6597 | |||
| 22caa158cc | |||
| b24a504d7c | |||
|  | 992ef6e9fc | ||
|  | f32a320bc3 | ||
|  | 5f0fe029d2 | ||
| 6b1486e89b | |||
|  | 3f9c427a3a | ||
|  | d201277652 | ||
| fdda7cf9cf | |||
| e22d30f715 | |||
| 1ba25a0d8c | |||
| 9ba3647bdf | |||
| 5ee832f738 | |||
|  | 35a69a5133 | ||
| e9c5a271a8 | |||
| acac2d6938 | |||
| 97db2b8d20 | |||
|  | ace9cd64bb | ||
|  | a3e2aeb603 | ||
|  | 049dd25785 | ||
|  | d43d372294 | ||
|  | b71a081cba | ||
|  | c48909590b | ||
|  | 446ef40570 | ||
|  | 81441e98f4 | ||
|  | ecd3f890f5 | ||
|  | 1c881ce23c | ||
|  | dacbbdd051 | ||
|  | 2859955a03 | ||
|  | cc220abd1d | ||
|  | d1c0c0197e | ||
|  | fd9424ef27 | ||
|  | a5c35c4024 | ||
|  | e03b64dc06 | ||
|  | 4677c40195 | ||
|  | 288c615782 | ||
|  | 48e81cf6f8 | 
| @@ -9,11 +9,6 @@ matrix: | ||||
|     - os:        osx | ||||
|       osx_image: xcode8.3 | ||||
|       compiler: clang | ||||
|       env: PREC=single | ||||
|     - os:        osx | ||||
|       osx_image: xcode8.3 | ||||
|       compiler: clang | ||||
|       env: PREC=double | ||||
|        | ||||
| before_install: | ||||
|     - export GRIDDIR=`pwd` | ||||
| @@ -55,7 +50,7 @@ script: | ||||
|     - make -j4 | ||||
|     - make install | ||||
|     - cd $CWD/build | ||||
|     - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} | ||||
|     - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} | ||||
|     - make -j4  | ||||
|     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals | ||||
|     - make check | ||||
|   | ||||
| @@ -34,6 +34,12 @@ | ||||
| #define __SYCL__REDEFINE__ | ||||
| #endif | ||||
|  | ||||
| /* HIP save and restore compile environment*/ | ||||
| #ifdef GRID_HIP | ||||
| #pragma push | ||||
| #pragma push_macro("__HIP_DEVICE_COMPILE__") | ||||
| #endif | ||||
| #define EIGEN_NO_HIP | ||||
|  | ||||
| #include <Grid/Eigen/Dense> | ||||
| #include <Grid/Eigen/unsupported/CXX11/Tensor> | ||||
| @@ -52,6 +58,12 @@ | ||||
| #pragma pop | ||||
| #endif | ||||
|  | ||||
| /*HIP restore*/ | ||||
| #ifdef __HIP__REDEFINE__ | ||||
| #pragma pop_macro("__HIP_DEVICE_COMPILE__") | ||||
| #pragma pop | ||||
| #endif | ||||
|  | ||||
| #if defined __GNUC__ | ||||
| #pragma GCC diagnostic pop | ||||
| #endif | ||||
|   | ||||
| @@ -49,11 +49,13 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner, | ||||
|   Lattice<dotp> fine_inner_msk(fine); | ||||
|  | ||||
|   // Multiply could be fused with innerProduct | ||||
|   // Single block sum kernel could do both masks. | ||||
|   fine_inner = localInnerProduct(fineX,fineY); | ||||
|   mult(fine_inner_msk, fine_inner,FineMask); | ||||
|   blockSum(CoarseInner,fine_inner_msk); | ||||
| } | ||||
|  | ||||
|  | ||||
| class Geometry { | ||||
| public: | ||||
|   int npoint; | ||||
| @@ -78,12 +80,8 @@ public: | ||||
|     } | ||||
|     directions   [2*_d]=0; | ||||
|     displacements[2*_d]=0; | ||||
|  | ||||
|     std::cout <<GridLogMessage << "Geometry "<<std::endl; | ||||
|     for(int p=0;p<npoint;p++){ | ||||
|       std::cout <<GridLogMessage << "point " <<p<<" dir "<<directions[p]<<" delta " <<displacements[p]<<std::endl; | ||||
|     } | ||||
|   } | ||||
|  | ||||
| }; | ||||
|    | ||||
| template<class Fobj,class CComplex,int nbasis> | ||||
| @@ -104,8 +102,8 @@ public: | ||||
|   Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :  | ||||
|     CoarseGrid(_CoarseGrid), | ||||
|     FineGrid(_FineGrid), | ||||
|     checkerboard(_checkerboard), | ||||
|     subspace(nbasis,_FineGrid) | ||||
|     subspace(nbasis,_FineGrid), | ||||
|     checkerboard(_checkerboard) | ||||
|   { | ||||
|   }; | ||||
|    | ||||
| @@ -287,8 +285,6 @@ public: | ||||
|   /////////////////////// | ||||
|   GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   { return geom.directions; }; | ||||
|   virtual std::vector<int> Displacements(void){ return geom.displacements; }; | ||||
|   void M (const CoarseVector &in, CoarseVector &out) | ||||
|   { | ||||
|     conformable(_grid,in.Grid()); | ||||
| @@ -312,9 +308,6 @@ public: | ||||
|  | ||||
|     int osites=Grid()->oSites(); | ||||
|  | ||||
|     autoView(st,Stencil,AcceleratorRead); | ||||
|     siteVector *CBp=Stencil.CommBuf(); | ||||
|  | ||||
|     accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { | ||||
|       int ss = sss/nbasis; | ||||
|       int b  = sss%nbasis; | ||||
| @@ -325,12 +318,12 @@ public: | ||||
|  | ||||
|       for(int point=0;point<geom.npoint;point++){ | ||||
|  | ||||
| 	SE=st.GetEntry(ptype,point,ss); | ||||
| 	SE=Stencil.GetEntry(ptype,point,ss); | ||||
| 	   | ||||
| 	if(SE->_is_local) {  | ||||
| 	  nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); | ||||
| 	} else { | ||||
| 	  nbr = coalescedRead(CBp[SE->_offset]); | ||||
| 	  nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); | ||||
| 	} | ||||
| 	acceleratorSynchronise(); | ||||
|  | ||||
| @@ -339,7 +332,7 @@ public: | ||||
| 	} | ||||
|       } | ||||
|       coalescedWrite(out_v[ss](b),res); | ||||
|     }); | ||||
|       }); | ||||
|  | ||||
|     for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); | ||||
|   }; | ||||
| @@ -416,23 +409,38 @@ public: | ||||
|       MdirCalc(in,out[p],p); | ||||
|     } | ||||
|   }; | ||||
|   void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp) | ||||
|   { | ||||
|   void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){ | ||||
|  | ||||
|     this->MdirComms(in); | ||||
|  | ||||
|     int ndim = in.Grid()->Nd(); | ||||
|  | ||||
|     int point=-1; | ||||
|     for(int p=0;p<geom.npoint;p++){ | ||||
|       if( (dir==geom.directions[p])&&(disp==geom.displacements[p])) point=p; | ||||
|     } | ||||
|     assert(point!=-1);// Must find | ||||
|     ////////////// | ||||
|     // 4D action like wilson | ||||
|     // 0+ => 0  | ||||
|     // 0- => 1 | ||||
|     // 1+ => 2  | ||||
|     // 1- => 3 | ||||
|     // etc.. | ||||
|     ////////////// | ||||
|     // 5D action like DWF | ||||
|     // 1+ => 0  | ||||
|     // 1- => 1 | ||||
|     // 2+ => 2  | ||||
|     // 2- => 3 | ||||
|     // etc.. | ||||
|     auto point = [dir, disp, ndim](){ | ||||
|       if(dir == 0 and disp == 0) | ||||
| 	return 8; | ||||
|       else if ( ndim==4 ) {  | ||||
| 	return (4 * dir + 1 - disp) / 2; | ||||
|       } else {  | ||||
| 	return (4 * (dir-1) + 1 - disp) / 2; | ||||
|       } | ||||
|     }(); | ||||
|  | ||||
|     std::cout <<GridLogMessage << "Mdir point "<<point<<" dir "<<dir<<" disp "<<disp  <<std::endl; | ||||
|     for(int p=0;p<geom.npoint;p++){ | ||||
|       std::cout <<GridLogMessage << "point " <<p<<" dir "<<geom.directions[p]<<" delta " <<geom.displacements[p]<<std::endl; | ||||
|     } | ||||
|     MdirCalc(in,out,point); | ||||
|  | ||||
|   }; | ||||
|  | ||||
|   void Mdiag(const CoarseVector &in, CoarseVector &out) | ||||
| @@ -448,58 +456,10 @@ public: | ||||
|     geom(CoarseGrid._ndimension), | ||||
|     hermitian(hermitian_), | ||||
|     Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), | ||||
|     A(geom.npoint,&CoarseGrid) | ||||
|       A(geom.npoint,&CoarseGrid) | ||||
|   { | ||||
|   }; | ||||
|  | ||||
|   void Test(Aggregation<Fobj,CComplex,nbasis> &_Aggregates,GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop) | ||||
|   { | ||||
|     typedef Lattice<Fobj> FineField; | ||||
|     CoarseVector Cin(_grid); | ||||
|     CoarseVector Cout(_grid); | ||||
|     CoarseVector CFout(_grid); | ||||
|  | ||||
|     FineField Fin(FineGrid); | ||||
|     FineField Fout(FineGrid); | ||||
|  | ||||
|  | ||||
|     std::vector<int> seeds({1,2,3,4,5}); | ||||
|     GridParallelRNG RNG(_grid);  RNG.SeedFixedIntegers(seeds); | ||||
|     gaussian(RNG,Cin); | ||||
|  | ||||
|     _Aggregates.PromoteFromSubspace(Cin,Fin); | ||||
|     _Aggregates.ProjectToSubspace(Cin,Fin); | ||||
|  | ||||
|     std::cout << GridLogMessage<< "************  "<<std::endl; | ||||
|     std::cout << GridLogMessage<< " Testing M  "<<std::endl; | ||||
|     std::cout << GridLogMessage<< "************  "<<std::endl; | ||||
|     // Coarse operator | ||||
|     this->M(Cin,Cout); | ||||
|     // Fine projected operator | ||||
|     _Aggregates.PromoteFromSubspace(Cin,Fin); | ||||
|     linop.Op(Fin,Fout); | ||||
|     _Aggregates.ProjectToSubspace(CFout,Fout); | ||||
|  | ||||
|     CFout = CFout-Cout; | ||||
|     RealD diff = norm2(CFout); | ||||
|     std::cout << GridLogMessage<< " diff  "<<diff<<std::endl; | ||||
|     assert(diff<1.0e-5); | ||||
|  | ||||
|     std::cout << GridLogMessage<< "************  "<<std::endl; | ||||
|     std::cout << GridLogMessage<< " Testing Mdag  "<<std::endl; | ||||
|     std::cout << GridLogMessage<< "************  "<<std::endl; | ||||
|     // Coarse operator | ||||
|     Mdag(Cin,Cout); | ||||
|     // Fine operator | ||||
|     linop.AdjOp(Fin,Fout); | ||||
|     _Aggregates.ProjectToSubspace(CFout,Fout); | ||||
|  | ||||
|     CFout = CFout-Cout; | ||||
|     diff = norm2(CFout); | ||||
|     std::cout << GridLogMessage<< " diff  "<<diff<<std::endl;  | ||||
|     assert(diff<1.0e-5); | ||||
|   } | ||||
|  | ||||
|   void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, | ||||
| 		       Aggregation<Fobj,CComplex,nbasis> & Subspace) | ||||
|   { | ||||
| @@ -536,19 +496,8 @@ public: | ||||
|  | ||||
|     CoarseScalar InnerProd(Grid());  | ||||
|  | ||||
|     std::cout << GridLogMessage<< "CoarsenMatrix Orthog " << std::endl; | ||||
|     // Orthogonalise the subblocks over the basis | ||||
|     blockOrthogonalise(InnerProd,Subspace.subspace); | ||||
|     std::cout << GridLogMessage<< "CoarsenMatrix Orthog done " << std::endl; | ||||
|  | ||||
|     auto OpDirections    = linop.Directions(); | ||||
|     auto OpDisplacements = linop.Displacements(); | ||||
|  | ||||
|     std::cout<<" Coarsening an operator with "<< OpDirections.size()<<" terms "<<std::endl; | ||||
|     for(int p=0;p<OpDirections.size();p++) { | ||||
|       assert(OpDirections[p]==geom.directions[p]); | ||||
|       assert(OpDisplacements[p]==geom.displacements[p]); | ||||
|     } | ||||
|  | ||||
|     // Compute the matrix elements of linop between this orthonormal | ||||
|     // set of vectors. | ||||
| @@ -584,27 +533,13 @@ public: | ||||
|     evenmask = where(mod(bcb,2)==(Integer)0,one,zero); | ||||
|     oddmask  = one-evenmask; | ||||
|  | ||||
|     /* | ||||
|     { | ||||
|       phi=Subspace.subspace[0]; | ||||
|       linop.OpDirAll(phi,Mphi_p); | ||||
|       for(int p=0;p<geom.npoint-1;p++){ | ||||
| 	int dir=geom.directions[p]; | ||||
| 	int disp=geom.displacements[p]; | ||||
| 	linop.OpDir(phi,Mphi,dir,disp); | ||||
| 	Mphi=Mphi-Mphi_p[p]; | ||||
| 	std::cout << GridLogMessage <<" Direction mapping check " <<norm2(Mphi)<<std::endl; | ||||
|       } | ||||
|     } | ||||
| */ | ||||
|     assert(self_stencil!=-1); | ||||
|     int lhermitian=hermitian; | ||||
|  | ||||
|     for(int i=0;i<nbasis;i++){ | ||||
|  | ||||
|       phi=Subspace.subspace[i]; | ||||
|  | ||||
|       std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl; | ||||
|       //      std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl; | ||||
|       linop.OpDirAll(phi,Mphi_p); | ||||
|       linop.OpDiag  (phi,Mphi_p[geom.npoint-1]); | ||||
|  | ||||
| @@ -615,7 +550,7 @@ public: | ||||
| 	int dir   = geom.directions[p]; | ||||
| 	int disp  = geom.displacements[p]; | ||||
|  | ||||
| 	if ( (disp==-1) || (!lhermitian ) ) { | ||||
| 	if ( (disp==-1) || (!hermitian ) ) { | ||||
|  | ||||
| 	  //////////////////////////////////////////////////////////////////////// | ||||
| 	  // Pick out contributions coming from this cell and neighbour cell | ||||
| @@ -633,23 +568,11 @@ public: | ||||
| 	    autoView( A_self  , A[self_stencil], AcceleratorWrite); | ||||
|  | ||||
| 	    accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); | ||||
| 	    if ( lhermitian && (disp==-1) ) { | ||||
| 	      for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>* | ||||
| 		int dirp   = geom.directions[pp]; | ||||
| 		int dispp  = geom.displacements[pp]; | ||||
| 		if ( (dirp==dir) && (dispp==1) ){ | ||||
| 		  auto sft = conjugate(Cshift(oZProj,dir,1)); | ||||
| 		  autoView( sft_v    ,  sft  , AcceleratorWrite); | ||||
| 		  autoView( A_pp     ,  A[pp], AcceleratorWrite); | ||||
| 		  accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); }); | ||||
| 		} | ||||
| 	      } | ||||
| 	    } | ||||
|  | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|  | ||||
|       std::cout << GridLogMessage<< "CoarsenMatrix Diag "<<std::endl; | ||||
|       /////////////////////////////////////////// | ||||
|       // Faster alternate self coupling.. use hermiticity to save 2x | ||||
|       /////////////////////////////////////////// | ||||
| @@ -681,35 +604,31 @@ public: | ||||
|  | ||||
|       } | ||||
|     } | ||||
|  | ||||
|     MemoryManager::PrintBytes(); | ||||
|  | ||||
|     // Auto self test | ||||
|     Test( Subspace,FineGrid,linop); | ||||
|  | ||||
| #if 0 | ||||
|     /////////////////////////// | ||||
|     // test code worth preserving in if block | ||||
|     /////////////////////////// | ||||
|     std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl; | ||||
|     for(int p=0;p<geom.npoint;p++){ | ||||
|       std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl; | ||||
|       std::cout<<GridLogMessage<< "\n"<<A[p] << std::endl; | ||||
|     if(hermitian) { | ||||
|       std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl; | ||||
|       ForceHermitian(); | ||||
|     } | ||||
|     std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl; | ||||
|  | ||||
|     phi=Subspace.subspace[0]; | ||||
|     std::vector<int> bc(FineGrid->_ndimension,0); | ||||
|     blockPick(Grid(),phi,tmp,bc);      // Pick out a block | ||||
|     linop.Op(tmp,Mphi);                // Apply big dop | ||||
|     blockProject(iProj,Mphi,Subspace.subspace); // project it and print it | ||||
|     std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl; | ||||
|     std::cout<<GridLogMessage<< iProj <<std::endl; | ||||
|     std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl; | ||||
| #endif | ||||
|  | ||||
|   } | ||||
|  | ||||
|   void ForceHermitian(void) { | ||||
|     CoarseMatrix Diff  (Grid()); | ||||
|     for(int p=0;p<geom.npoint;p++){ | ||||
|       int dir   = geom.directions[p]; | ||||
|       int disp  = geom.displacements[p]; | ||||
|       if(disp==-1) { | ||||
| 	// Find the opposite link | ||||
| 	for(int pp=0;pp<geom.npoint;pp++){ | ||||
| 	  int dirp   = geom.directions[pp]; | ||||
| 	  int dispp  = geom.displacements[pp]; | ||||
| 	  if ( (dirp==dir) && (dispp==1) ){ | ||||
| 	    //	    Diff = adj(Cshift(A[p],dir,1)) - A[pp];  | ||||
| 	    //	    std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl; | ||||
| 	    A[pp] = adj(Cshift(A[p],dir,1)); | ||||
| 	  } | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
| @@ -52,9 +52,6 @@ public: | ||||
|   virtual void AdjOp  (const Field &in, Field &out) = 0; // Abstract base | ||||
|   virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; | ||||
|   virtual void HermOp(const Field &in, Field &out)=0; | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   =0; | ||||
|   virtual std::vector<int> Displacements(void)=0; | ||||
| }; | ||||
|  | ||||
|  | ||||
| @@ -79,9 +76,6 @@ class MdagMLinearOperator : public LinearOperatorBase<Field> { | ||||
| public: | ||||
|   MdagMLinearOperator(Matrix &Mat): _Mat(Mat){}; | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|  | ||||
|   // Support for coarsening to a multigrid | ||||
|   void OpDiag (const Field &in, Field &out) { | ||||
|     _Mat.Mdiag(in,out); | ||||
| @@ -117,8 +111,6 @@ class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> { | ||||
|   Matrix &_Mat; | ||||
|   RealD _shift; | ||||
| public: | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|   ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){}; | ||||
|   // Support for coarsening to a multigrid | ||||
|   void OpDiag (const Field &in, Field &out) { | ||||
| @@ -159,8 +151,6 @@ template<class Matrix,class Field> | ||||
| class HermitianLinearOperator : public LinearOperatorBase<Field> { | ||||
|   Matrix &_Mat; | ||||
| public: | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|   HermitianLinearOperator(Matrix &Mat): _Mat(Mat){}; | ||||
|   // Support for coarsening to a multigrid | ||||
|   void OpDiag (const Field &in, Field &out) { | ||||
| @@ -192,8 +182,6 @@ template<class Matrix,class Field> | ||||
| class NonHermitianLinearOperator : public LinearOperatorBase<Field> { | ||||
|   Matrix &_Mat; | ||||
| public: | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|   NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){}; | ||||
|   // Support for coarsening to a multigrid | ||||
|   void OpDiag (const Field &in, Field &out) { | ||||
| @@ -267,8 +255,6 @@ template<class Matrix,class Field> | ||||
|   class SchurDiagMooeeOperator :  public SchurOperatorBase<Field> { | ||||
|  public: | ||||
|     Matrix &_Mat; | ||||
|     virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|     virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|     SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; | ||||
|     virtual  void Mpc      (const Field &in, Field &out) { | ||||
|       Field tmp(in.Grid()); | ||||
| @@ -295,8 +281,6 @@ template<class Matrix,class Field> | ||||
|  protected: | ||||
|     Matrix &_Mat; | ||||
|  public: | ||||
|     virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|     virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|     SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; | ||||
|      | ||||
|     virtual void Mpc      (const Field &in, Field &out) { | ||||
| @@ -323,8 +307,6 @@ template<class Matrix,class Field> | ||||
|  protected: | ||||
|     Matrix &_Mat; | ||||
|  public: | ||||
|     virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|     virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|     SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; | ||||
|      | ||||
|     virtual void Mpc      (const Field &in, Field &out) { | ||||
| @@ -390,8 +372,6 @@ class NonHermitianSchurDiagMooeeOperator :  public NonHermitianSchurOperatorBase | ||||
| { | ||||
|  public: | ||||
|   Matrix& _Mat; | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|  NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){}; | ||||
|   virtual void Mpc(const Field& in, Field& out) { | ||||
|     Field tmp(in.Grid()); | ||||
| @@ -425,8 +405,6 @@ class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Fi | ||||
|   Matrix &_Mat; | ||||
|    | ||||
|  public: | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|   NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){}; | ||||
|   virtual void Mpc(const Field& in, Field& out) { | ||||
|     Field tmp(in.Grid()); | ||||
| @@ -457,8 +435,6 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi | ||||
|   Matrix& _Mat; | ||||
|    | ||||
|  public: | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|  NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){}; | ||||
|  | ||||
|   virtual void Mpc(const Field& in, Field& out) { | ||||
| @@ -499,8 +475,6 @@ class SchurStaggeredOperator :  public SchurOperatorBase<Field> { | ||||
|   Field tmp; | ||||
|   RealD mass; | ||||
|  public: | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
|   SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())  | ||||
|   {  | ||||
|     assert( _Mat.isTrivialEE() ); | ||||
|   | ||||
| @@ -48,8 +48,6 @@ public: | ||||
|   virtual  void Mdiag    (const Field &in, Field &out)=0; | ||||
|   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp)=0; | ||||
|   virtual  void MdirAll  (const Field &in, std::vector<Field> &out)=0; | ||||
|   virtual std::vector<int> Directions(void)   =0; | ||||
|   virtual std::vector<int> Displacements(void)=0; | ||||
| }; | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -75,8 +73,6 @@ public: | ||||
|   virtual  void MooeeDag    (const Field &in, Field &out)=0; | ||||
|   virtual  void MooeeInvDag (const Field &in, Field &out)=0; | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   =0; | ||||
|   virtual std::vector<int> Displacements(void)=0; | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|    /************************************************************************************* | ||||
|     /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG | ||||
| #define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|   /* | ||||
|    * Compared to Tang-2009:  P=Pleft. P^T = PRight Q=MssInv.  | ||||
|    * Script A = SolverMatrix  | ||||
| @@ -51,54 +50,53 @@ NAMESPACE_BEGIN(Grid); | ||||
|    * Vout = x | ||||
|    */ | ||||
|  | ||||
|  | ||||
| template<class Field, class CoarseField, class Aggregates> | ||||
| // abstract base | ||||
| template<class Field, class CoarseField> | ||||
| class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
| { | ||||
|  public: | ||||
|  | ||||
|   int verbose; | ||||
|  | ||||
|   RealD   Tolerance; | ||||
|   Integer MaxIterations; | ||||
|   const int mmax = 4; | ||||
|   GridBase *FineGrid; | ||||
|   GridBase *CoarseGrid; | ||||
|   const int mmax = 5; | ||||
|   GridBase *grid; | ||||
|   GridBase *coarsegrid; | ||||
|  | ||||
|   LinearOperatorBase<Field>   &_Linop; | ||||
|   LinearFunction<Field>     &_Smoother; | ||||
|   LinearFunction<CoarseField> &_CoarseSolver; | ||||
|   Aggregates                  &_Aggregates; | ||||
|   LinearOperatorBase<Field>   *_Linop | ||||
|   OperatorFunction<Field>     *_Smoother, | ||||
|   LinearFunction<CoarseField> *_CoarseSolver; | ||||
|  | ||||
|   // Need somthing that knows how to get from Coarse to fine and back again | ||||
|    | ||||
|   // more most opertor functions | ||||
|   TwoLevelFlexiblePcg(RealD tol, | ||||
| 		      Integer maxit, | ||||
| 		      LinearOperatorBase<Field> *Linop, | ||||
| 		      LinearFunction<Field>   *Smoother, | ||||
| 		      LinearFunction<CoarseField> *CoarseSolver, | ||||
| 		      Aggregates *AggP | ||||
| 		      ) :  | ||||
|   Tolerance(tol),  | ||||
|     MaxIterations(maxit), | ||||
|     _Linop(*Linop), | ||||
|     _Smoother(*Smoother), | ||||
|     _CoarseSolver(*CoarseSolver), | ||||
|     _Aggregates(*AggP) | ||||
| 		     Integer maxit, | ||||
| 		     LinearOperatorBase<Field> *Linop, | ||||
| 		     LinearOperatorBase<Field> *SmootherLinop, | ||||
| 		     OperatorFunction<Field>   *Smoother, | ||||
| 		     OperatorFunction<CoarseField>  CoarseLinop | ||||
| 		     ) :  | ||||
|       Tolerance(tol),  | ||||
|       MaxIterations(maxit), | ||||
|       _Linop(Linop), | ||||
|       _PreconditionerLinop(PrecLinop), | ||||
|       _Preconditioner(Preconditioner) | ||||
|   {  | ||||
|     CoarseGrid=_Aggregates.CoarseGrid; | ||||
|     FineGrid=_Aggregates.FineGrid; | ||||
|     verbose=0; | ||||
|   }; | ||||
|  | ||||
|   // The Pcg routine is common to all, but the various matrices differ from derived  | ||||
|   // implementation to derived implmentation | ||||
|   void operator() (const Field &src, Field &psi){ | ||||
|   void operator() (const Field &src, Field &psi){ | ||||
|  | ||||
|     psi.Checkerboard() = src.Checkerboard(); | ||||
|     grid             = src.Grid(); | ||||
|  | ||||
|     RealD f; | ||||
|     RealD rtzp,rtz,a,d,b; | ||||
|     //    RealD rptzp; | ||||
|     //    RealD tn; | ||||
|     RealD rptzp; | ||||
|     RealD tn; | ||||
|     RealD guess = norm2(psi); | ||||
|     RealD ssq   = norm2(src); | ||||
|     RealD rsq   = ssq*Tolerance*Tolerance; | ||||
| @@ -106,15 +104,15 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
|     ///////////////////////////// | ||||
|     // Set up history vectors | ||||
|     ///////////////////////////// | ||||
|     std::vector<Field> p  (mmax,FineGrid); | ||||
|     std::vector<Field> mmp(mmax,FineGrid); | ||||
|     std::vector<Field> p  (mmax,grid); | ||||
|     std::vector<Field> mmp(mmax,grid); | ||||
|     std::vector<RealD> pAp(mmax); | ||||
|  | ||||
|     Field x  (FineGrid); x = psi; | ||||
|     Field z  (FineGrid); | ||||
|     Field tmp(FineGrid); | ||||
|     Field r  (FineGrid); | ||||
|     Field mu (FineGrid); | ||||
|     Field x  (grid); x = psi; | ||||
|     Field z  (grid); | ||||
|     Field tmp(grid); | ||||
|     Field r  (grid); | ||||
|     Field mu (grid); | ||||
|    | ||||
|     ////////////////////////// | ||||
|     // x0 = Vstart -- possibly modify guess | ||||
| @@ -123,13 +121,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
|     Vstart(x,src); | ||||
|  | ||||
|     // r0 = b -A x0 | ||||
|     _Linop.HermOp(x,mmp[0]); // Shouldn't this be something else? | ||||
|     HermOp(x,mmp); // Shouldn't this be something else? | ||||
|     axpy (r, -1.0,mmp[0], src);    // Recomputes r=src-Ax0 | ||||
|  | ||||
|     ////////////////////////////////// | ||||
|     // Compute z = M1 x | ||||
|     ////////////////////////////////// | ||||
|     M1(r,z); | ||||
|     M1(r,z,tmp,mp,SmootherMirs); | ||||
|     rtzp =real(innerProduct(r,z)); | ||||
|  | ||||
|     /////////////////////////////////////// | ||||
| @@ -145,7 +143,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
|       int peri_kp = (k+1) % mmax; | ||||
|  | ||||
|       rtz=rtzp; | ||||
|       d= M3(p[peri_k],mmp[peri_k]); | ||||
|       d= M3(p[peri_k],mp,mmp[peri_k],tmp); | ||||
|       a = rtz/d; | ||||
|      | ||||
|       // Memorise this | ||||
| @@ -155,13 +153,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
|       RealD rn = axpy_norm(r,-a,mmp[peri_k],r); | ||||
|  | ||||
|       // Compute z = M x | ||||
|       M1(r,z); | ||||
|       M1(r,z,tmp,mp); | ||||
|  | ||||
|       rtzp =real(innerProduct(r,z)); | ||||
|  | ||||
|       M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate | ||||
|  | ||||
|       p[peri_kp]=mu; | ||||
|       p[peri_kp]=p[peri_k]; | ||||
|  | ||||
|       // Standard search direction  p -> z + b p    ; b =  | ||||
|       b = (rtzp)/rtz; | ||||
| @@ -183,7 +181,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
|       // Stopping condition | ||||
|       if ( rn <= rsq ) {  | ||||
|  | ||||
| 	_Linop.HermOp(x,mmp[0]); // Shouldn't this be something else? | ||||
| 	HermOp(x,mmp); // Shouldn't this be something else? | ||||
| 	axpy(tmp,-1.0,src,mmp[0]); | ||||
| 	 | ||||
| 	RealD psinorm = sqrt(norm2(x)); | ||||
| @@ -192,8 +190,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
| 	RealD true_residual = tmpnorm/srcnorm; | ||||
| 	std::cout<<GridLogMessage<<"TwoLevelfPcg:   true residual is "<<true_residual<<std::endl; | ||||
| 	std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl; | ||||
|  | ||||
| 	return; | ||||
| 	return k; | ||||
|       } | ||||
|     } | ||||
|     // Non-convergence | ||||
| @@ -202,40 +199,48 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
|  | ||||
|  public: | ||||
|  | ||||
|   virtual void M1(Field & in, Field & out)  | ||||
|   {// the smoother | ||||
|   virtual void M(Field & in,Field & out,Field & tmp) { | ||||
|  | ||||
|   } | ||||
|  | ||||
|   virtual void M1(Field & in, Field & out) {// the smoother | ||||
|  | ||||
|     // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min] | ||||
|     Field tmp(FineGrid); | ||||
|     Field Min(FineGrid); | ||||
|     Field tmp(grid); | ||||
|     Field Min(grid); | ||||
|  | ||||
|     CoarseField PleftProj(CoarseGrid); | ||||
|     CoarseField PleftMss_proj(CoarseGrid); | ||||
|     PcgM(in,Min); // Smoother call | ||||
|  | ||||
|     _Smoother(in,Min); // Smoother call | ||||
|  | ||||
|     _Linop.HermOp(Min,out); | ||||
|     HermOp(Min,out); | ||||
|     axpy(tmp,-1.0,out,in);          // tmp  = in - A Min | ||||
|  | ||||
|     _Aggregates.ProjectToSubspace(PleftProj,tmp);      | ||||
|     _CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s | ||||
|     _Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]   | ||||
|     ProjectToSubspace(tmp,PleftProj);      | ||||
|     ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s | ||||
|     PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]   | ||||
|     axpy(out,1.0,Min,tmp); // Min+tmp | ||||
|   } | ||||
|  | ||||
|   virtual void M2(const Field & in, Field & out)  | ||||
|   { | ||||
|   virtual void M2(const Field & in, Field & out) { | ||||
|     out=in; | ||||
|     // Must override for Def2 only | ||||
|     //  case PcgDef2: | ||||
|     //    Pright(in,out); | ||||
|     //    break; | ||||
|   } | ||||
|  | ||||
|   virtual RealD M3(const Field & p, Field & mmp) | ||||
|   { | ||||
|   virtual RealD M3(const Field & p, Field & mmp){ | ||||
|     double d,dd; | ||||
|     _Linop.HermOpAndNorm(p,mmp,d,dd); | ||||
|     HermOpAndNorm(p,mmp,d,dd); | ||||
|     return dd; | ||||
|     // Must override for Def1 only | ||||
|     //  case PcgDef1: | ||||
|     //    d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no | ||||
|     //      linop_d->Mprec(mmp,mp,tmp,1);// Dag yes | ||||
|     //    Pleft(mp,mmp); | ||||
|     //    d=real(linop_d->inner(p,mmp)); | ||||
|   } | ||||
|  | ||||
|   virtual void Vstart(Field & x,const Field & src) | ||||
|   { | ||||
|   virtual void VstartDef2(Field & xconst Field & src){ | ||||
|     //case PcgDef2: | ||||
|     //case PcgAdef2:  | ||||
|     //case PcgAdef2f: | ||||
| @@ -251,79 +256,142 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field> | ||||
|     //                   = src_s - (A guess)_s - src_s  + (A guess)_s  | ||||
|     //                   = 0  | ||||
|     /////////////////////////////////// | ||||
|     Field r(FineGrid); | ||||
|     Field mmp(FineGrid); | ||||
|  | ||||
|     CoarseField PleftProj(CoarseGrid); | ||||
|     CoarseField PleftMss_proj(CoarseGrid); | ||||
|     Field r(grid); | ||||
|     Field mmp(grid); | ||||
|      | ||||
|     _Linop.HermOp(x,mmp); | ||||
|     HermOp(x,mmp); | ||||
|     axpy (r, -1.0, mmp, src);        // r_{-1} = src - A x | ||||
|     _Aggregates.ProjectToSubspace(PleftProj,r);      | ||||
|     _CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} r_s | ||||
|     _Aggregates.PromoteFromSubspace(PleftMss_proj,mmp);   | ||||
|     ProjectToSubspace(r,PleftProj);      | ||||
|     ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s | ||||
|     PromoteFromSubspace(PleftMss_proj,mmp);   | ||||
|     x=x+mmp; | ||||
|  | ||||
|   } | ||||
|  | ||||
|   virtual void Vstart(Field & x,const Field & src){ | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////////////// | ||||
|   // Only Def1 has non-trivial Vout. Override in Def1 | ||||
|   ///////////////////////////////////////////////////////////////////// | ||||
|   virtual void   Vout  (Field & in, Field & out,Field & src){ | ||||
|     out = in; | ||||
|     //case PcgDef1: | ||||
|     //    //Qb + PT x | ||||
|     //    ProjectToSubspace(src,PleftProj);      | ||||
|     //    ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s | ||||
|     //    PromoteFromSubspace(PleftMss_proj,tmp);   | ||||
|     //     | ||||
|     //    Pright(in,out); | ||||
|     //     | ||||
|     //    linop_d->axpy(out,tmp,out,1.0); | ||||
|     //    break; | ||||
|   } | ||||
|  | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Pright and Pleft are common to all implementations | ||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   virtual void Pright(Field & in,Field & out) | ||||
|   { | ||||
|   virtual void Pright(Field & in,Field & out){ | ||||
|     // P_R  = [ 1              0 ]  | ||||
|     //        [ -Mss^-1 Msb    0 ]  | ||||
|     Field in_sbar(FineGrid); | ||||
|     Field in_sbar(grid); | ||||
|  | ||||
|     CoarseField PleftProj(CoarseGrid); | ||||
|     CoarseField PleftMss_proj(CoarseGrid); | ||||
|  | ||||
|     _Aggregates.ProjectToSubspace(PleftProj,in);      | ||||
|     _Aggregates.PromoteFromSubspace(PleftProj,out);   | ||||
|     ProjectToSubspace(in,PleftProj);      | ||||
|     PromoteFromSubspace(PleftProj,out);   | ||||
|     axpy(in_sbar,-1.0,out,in);       // in_sbar = in - in_s  | ||||
|  | ||||
|     _Linop.HermOp(in_sbar,out); | ||||
|     _Aggregates.ProjectToSubspace(PleftProj,out);           // Mssbar in_sbar  (project) | ||||
|     HermOp(in_sbar,out); | ||||
|     ProjectToSubspace(out,PleftProj);           // Mssbar in_sbar  (project) | ||||
|  | ||||
|     _CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} Mssbar  | ||||
|     _Aggregates.PromoteFromSubspace(PleftMss_proj,out);     //  | ||||
|     ApplyInverse     (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar  | ||||
|     PromoteFromSubspace(PleftMss_proj,out);     //  | ||||
|  | ||||
|     axpy(out,-1.0,out,in_sbar);     // in_sbar - Mss^{-1} Mssbar in_sbar | ||||
|   } | ||||
|   virtual void Pleft (Field & in,Field & out) | ||||
|   { | ||||
|   virtual void Pleft (Field & in,Field & out){ | ||||
|     // P_L  = [ 1  -Mbs Mss^-1]  | ||||
|     //        [ 0   0         ]  | ||||
|     Field in_sbar(FineGrid); | ||||
|     Field    tmp2(FineGrid); | ||||
|     Field    Mtmp(FineGrid); | ||||
|     Field in_sbar(grid); | ||||
|     Field    tmp2(grid); | ||||
|     Field    Mtmp(grid); | ||||
|  | ||||
|     CoarseField PleftProj(CoarseGrid); | ||||
|     CoarseField PleftMss_proj(CoarseGrid); | ||||
|  | ||||
|     _Aggregates.ProjectToSubspace(PleftProj,in);      | ||||
|     _Aggregates.PromoteFromSubspace(PleftProj,out);   | ||||
|     ProjectToSubspace(in,PleftProj);      | ||||
|     PromoteFromSubspace(PleftProj,out);   | ||||
|     axpy(in_sbar,-1.0,out,in);      // in_sbar = in - in_s | ||||
|  | ||||
|     _CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} in_s | ||||
|     _Aggregates.PromoteFromSubspace(PleftMss_proj,out); | ||||
|     ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s | ||||
|     PromoteFromSubspace(PleftMss_proj,out); | ||||
|  | ||||
|     _Linop.HermOp(out,Mtmp); | ||||
|     HermOp(out,Mtmp); | ||||
|  | ||||
|     _Aggregates.ProjectToSubspace(PleftProj,Mtmp);      // Msbar s Mss^{-1} | ||||
|     _Aggregates.PromoteFromSubspace(PleftProj,tmp2); | ||||
|     ProjectToSubspace(Mtmp,PleftProj);      // Msbar s Mss^{-1} | ||||
|     PromoteFromSubspace(PleftProj,tmp2); | ||||
|  | ||||
|     axpy(out,-1.0,tmp2,Mtmp); | ||||
|     axpy(out,-1.0,out,in_sbar);     // in_sbar - Msbars Mss^{-1} in_s | ||||
|   } | ||||
| }; | ||||
| NAMESPACE_END(Grid); | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> { | ||||
|  public: | ||||
|   virtual void M(Field & in,Field & out,Field & tmp){ | ||||
|  | ||||
|   }  | ||||
|   virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){ | ||||
|  | ||||
|   } | ||||
|   virtual void M2(Field & in, Field & out){ | ||||
|  | ||||
|   } | ||||
|   virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){ | ||||
|  | ||||
|   } | ||||
|   virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){ | ||||
|  | ||||
|   } | ||||
| } | ||||
| /* | ||||
| template<class Field> | ||||
| class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> { | ||||
|  public: | ||||
|   virtual void M(Field & in,Field & out,Field & tmp);  | ||||
|   virtual void M1(Field & in, Field & out,Field & tmp,Field & mp); | ||||
|   virtual void M2(Field & in, Field & out); | ||||
|   virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp); | ||||
|   virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp); | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> { | ||||
|  public: | ||||
|   virtual void M(Field & in,Field & out,Field & tmp);  | ||||
|   virtual void M1(Field & in, Field & out,Field & tmp,Field & mp); | ||||
|   virtual void M2(Field & in, Field & out); | ||||
|   virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp); | ||||
|   virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp); | ||||
|   virtual void   Vout  (Field & in, Field & out,Field & src,Field & tmp); | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> { | ||||
|  public: | ||||
|   virtual void M(Field & in,Field & out,Field & tmp);  | ||||
|   virtual void M1(Field & in, Field & out,Field & tmp,Field & mp); | ||||
|   virtual void M2(Field & in, Field & out); | ||||
|   virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp); | ||||
|   virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp); | ||||
| } | ||||
|  | ||||
| template<class Field> | ||||
| class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> { | ||||
|  public: | ||||
|   virtual void M(Field & in,Field & out,Field & tmp);  | ||||
|   virtual void M1(Field & in, Field & out,Field & tmp,Field & mp); | ||||
|   virtual void M2(Field & in, Field & out); | ||||
|   virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp); | ||||
|   virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp); | ||||
| } | ||||
| */ | ||||
| #endif | ||||
|   | ||||
| @@ -60,8 +60,6 @@ public: | ||||
|   DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {}; | ||||
|  | ||||
|   virtual void operator()(const Field &src,Field &guess) { | ||||
|     RealD t=-usecond(); | ||||
|      | ||||
|     guess = Zero(); | ||||
|     assert(evec.size()==eval.size()); | ||||
|     auto N = evec.size(); | ||||
| @@ -70,8 +68,6 @@ public: | ||||
|       axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); | ||||
|     } | ||||
|     guess.Checkerboard() = src.Checkerboard(); | ||||
|     t+=usecond(); | ||||
|     std::cout<<GridLogMessage<<"\t\t\t" << "Deflated guess took "<< t/1000.0<< "ms" <<std::endl; | ||||
|   } | ||||
| }; | ||||
|  | ||||
|   | ||||
| @@ -59,7 +59,7 @@ public: | ||||
|     GridBase *grid = src.Grid(); | ||||
|     Field r(grid),  p(grid), Ap(grid), Ar(grid), z(grid); | ||||
|        | ||||
|     psi=Zero(); | ||||
|     psi=zero; | ||||
|     r  = src; | ||||
|     Preconditioner(r,p); | ||||
|  | ||||
|   | ||||
| @@ -53,11 +53,7 @@ public: | ||||
|   {  | ||||
|     size_type bytes = __n*sizeof(_Tp); | ||||
|     profilerAllocate(bytes); | ||||
| #ifdef GRID_UVM | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes); | ||||
| #else  | ||||
|     _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); | ||||
| #endif | ||||
|     assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); | ||||
|     return ptr; | ||||
|   } | ||||
| @@ -66,11 +62,7 @@ public: | ||||
|   {  | ||||
|     size_type bytes = __n * sizeof(_Tp); | ||||
|     profilerFree(bytes); | ||||
| #ifdef GRID_UVM | ||||
|     MemoryManager::SharedFree((void *)__p,bytes); | ||||
| #else | ||||
|     MemoryManager::CpuFree((void *)__p,bytes); | ||||
| #endif | ||||
|   } | ||||
|  | ||||
|   // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop | ||||
|   | ||||
| @@ -9,13 +9,11 @@ NAMESPACE_BEGIN(Grid); | ||||
| #define AccSmall (3) | ||||
| #define Shared   (4) | ||||
| #define SharedSmall (5) | ||||
| uint64_t total_cache; | ||||
| uint64_t total_shared; | ||||
| uint64_t total_device; | ||||
| uint64_t total_host;; | ||||
| void MemoryManager::PrintBytes(void) | ||||
| { | ||||
|   std::cout << " MemoryManager : "<<total_cache <<" cache       bytes "<<std::endl; | ||||
|   std::cout << " MemoryManager : "<<total_shared<<" shared      bytes "<<std::endl; | ||||
|   std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl; | ||||
|   std::cout << " MemoryManager : "<<total_host  <<" cpu         bytes "<<std::endl; | ||||
| @@ -37,8 +35,6 @@ void *MemoryManager::AcceleratorAllocate(size_t bytes) | ||||
|   if ( ptr == (void *) NULL ) { | ||||
|     ptr = (void *) acceleratorAllocDevice(bytes); | ||||
|     total_device+=bytes; | ||||
|   } else { | ||||
|     //    std::cout <<"AcceleratorAllocate: cache hit Device pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl; | ||||
|   } | ||||
|   return ptr; | ||||
| } | ||||
| @@ -57,10 +53,8 @@ void *MemoryManager::SharedAllocate(size_t bytes) | ||||
|   if ( ptr == (void *) NULL ) { | ||||
|     ptr = (void *) acceleratorAllocShared(bytes); | ||||
|     total_shared+=bytes; | ||||
|     //    std::cout <<"SharedAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl; | ||||
|     //    std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl; | ||||
|     //    PrintBytes(); | ||||
|   } else { | ||||
|     //    std::cout <<"SharedAllocate: cache hit Shared pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl; | ||||
|   } | ||||
|   return ptr; | ||||
| } | ||||
| @@ -80,9 +74,6 @@ void *MemoryManager::CpuAllocate(size_t bytes) | ||||
|   if ( ptr == (void *) NULL ) { | ||||
|     ptr = (void *) acceleratorAllocShared(bytes); | ||||
|     total_host+=bytes; | ||||
|     //    std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::dec<<std::endl; | ||||
|   } else { | ||||
|     //    std::cout <<"CpufAllocate: cache hit Cpu pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl; | ||||
|   } | ||||
|   return ptr; | ||||
| } | ||||
| @@ -129,7 +120,7 @@ void MemoryManager::Init(void) | ||||
|   str= getenv("GRID_ALLOC_NCACHE_LARGE"); | ||||
|   if ( str ) { | ||||
|     Nc = atoi(str); | ||||
|     if ( (Nc>=0) && (Nc <= NallocCacheMax)) { | ||||
|     if ( (Nc>=0) && (Nc < NallocCacheMax)) { | ||||
|       Ncache[Cpu]=Nc; | ||||
|       Ncache[Acc]=Nc; | ||||
|       Ncache[Shared]=Nc; | ||||
| @@ -139,7 +130,7 @@ void MemoryManager::Init(void) | ||||
|   str= getenv("GRID_ALLOC_NCACHE_SMALL"); | ||||
|   if ( str ) { | ||||
|     Nc = atoi(str); | ||||
|     if ( (Nc>=0) && (Nc <= NallocCacheMax)) { | ||||
|     if ( (Nc>=0) && (Nc < NallocCacheMax)) { | ||||
|       Ncache[CpuSmall]=Nc; | ||||
|       Ncache[AccSmall]=Nc; | ||||
|       Ncache[SharedSmall]=Nc; | ||||
| @@ -220,7 +211,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries | ||||
|  | ||||
|   if ( entries[v].valid ) { | ||||
|     ret = entries[v].address; | ||||
|     total_cache-=entries[v].bytes; | ||||
|     entries[v].valid = 0; | ||||
|     entries[v].address = NULL; | ||||
|     entries[v].bytes = 0; | ||||
| @@ -229,7 +219,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries | ||||
|   entries[v].address=ptr; | ||||
|   entries[v].bytes  =bytes; | ||||
|   entries[v].valid  =1; | ||||
|   total_cache+=entries[v].bytes; | ||||
|  | ||||
|   return ret; | ||||
| } | ||||
| @@ -254,7 +243,6 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach | ||||
|   for(int e=0;e<ncache;e++){ | ||||
|     if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { | ||||
|       entries[e].valid = 0; | ||||
|       total_cache-=bytes; | ||||
|       return entries[e].address; | ||||
|     } | ||||
|   } | ||||
|   | ||||
| @@ -93,8 +93,8 @@ private: | ||||
|   static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ; | ||||
|   static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ; | ||||
|  | ||||
|  public: | ||||
|   static void PrintBytes(void); | ||||
|  public: | ||||
|   static void Init(void); | ||||
|   static void InitMessage(void); | ||||
|   static void *AcceleratorAllocate(size_t bytes); | ||||
|   | ||||
| @@ -138,21 +138,6 @@ public: | ||||
| 		      int recv_from_rank, | ||||
| 		      int bytes); | ||||
|    | ||||
|   void SendRecvPacket(void *xmit, | ||||
| 		      void *recv, | ||||
| 		      int xmit_to_rank, | ||||
| 		      int recv_from_rank, | ||||
| 		      int bytes); | ||||
|    | ||||
|   void SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| 			   void *xmit, | ||||
| 			   int xmit_to_rank, | ||||
| 			   void *recv, | ||||
| 			   int recv_from_rank, | ||||
| 			   int bytes); | ||||
|    | ||||
|   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||
|  | ||||
|   double StencilSendToRecvFrom(void *xmit, | ||||
| 			       int xmit_to_rank, | ||||
| 			       void *recv, | ||||
|   | ||||
| @@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){} | ||||
| void CartesianCommunicator::GlobalXOR(uint32_t &){} | ||||
| void CartesianCommunicator::GlobalXOR(uint64_t &){} | ||||
|  | ||||
| void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||
| 					   void *recv, | ||||
| 					   int xmit_to_rank, | ||||
| 					   int recv_from_rank, | ||||
| 					   int bytes) | ||||
| { | ||||
|   assert(0); | ||||
| } | ||||
|  | ||||
|  | ||||
| // Basic Halo comms primitive -- should never call in single node | ||||
| void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||
| @@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||
| { | ||||
|   assert(0); | ||||
| } | ||||
| void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| 						void *xmit, | ||||
| 						int dest, | ||||
| 						void *recv, | ||||
| 						int from, | ||||
| 						int bytes) | ||||
| { | ||||
|   assert(0); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||
| { | ||||
|   assert(0); | ||||
| } | ||||
| void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes) | ||||
| { | ||||
|   bcopy(in,out,bytes*words); | ||||
| @@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | ||||
| 						     int recv_from_rank, | ||||
| 						     int bytes, int dir) | ||||
| { | ||||
|   std::vector<CommsRequest_t> list; | ||||
|   // Discard the "dir" | ||||
|   SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||
|   SendToRecvFromComplete(list); | ||||
|   return 2.0*bytes; | ||||
| } | ||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||
| @@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | ||||
| 							 int recv_from_rank, | ||||
| 							 int bytes, int dir) | ||||
| { | ||||
|   // Discard the "dir" | ||||
|   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||
|   return 2.0*bytes; | ||||
| } | ||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||
| { | ||||
|   SendToRecvFromComplete(waitall); | ||||
| } | ||||
|  | ||||
| void CartesianCommunicator::StencilBarrier(void){}; | ||||
|   | ||||
| @@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #ifdef GRID_CUDA | ||||
| #include <cuda_runtime_api.h> | ||||
| #endif | ||||
| #ifdef GRID_HIP | ||||
| #include <hip/hip_runtime_api.h> | ||||
| #endif | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid);  | ||||
| #define header "SharedMemoryMpi: " | ||||
| @@ -425,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Hugetlbfs mapping intended | ||||
| //////////////////////////////////////////////////////////////////////////////////////////// | ||||
| #ifdef GRID_CUDA | ||||
| #if defined(GRID_CUDA) ||defined(GRID_HIP) | ||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
| { | ||||
|   void * ShmCommBuf ;  | ||||
| @@ -448,21 +451,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|   // Each MPI rank should allocate our own buffer | ||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| #ifndef GRID_MPI3_SHM_NONE | ||||
|   auto err =  cudaMalloc(&ShmCommBuf, bytes); | ||||
| #else | ||||
|   auto err =  cudaMallocManaged(&ShmCommBuf, bytes); | ||||
| #endif | ||||
|   if ( err !=  cudaSuccess) { | ||||
|     std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl; | ||||
|     exit(EXIT_FAILURE);   | ||||
|   } | ||||
|   ShmCommBuf = acceleratorAllocDevice(bytes); | ||||
|  | ||||
|   if (ShmCommBuf == (void *)NULL ) { | ||||
|     std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl; | ||||
|     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; | ||||
|     exit(EXIT_FAILURE);   | ||||
|   } | ||||
|   if ( WorldRank == 0 ){ | ||||
|     std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; | ||||
|     std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes  | ||||
| 	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; | ||||
|   } | ||||
|   SharedMemoryZero(ShmCommBuf,bytes); | ||||
|  | ||||
| @@ -475,15 +472,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|     ////////////////////////////////////////////////// | ||||
|     // If it is me, pass around the IPC access key | ||||
|     ////////////////////////////////////////////////// | ||||
| #ifdef GRID_CUDA | ||||
|     cudaIpcMemHandle_t handle; | ||||
|      | ||||
|     if ( r==WorldShmRank ) {  | ||||
|       err = cudaIpcGetMemHandle(&handle,ShmCommBuf); | ||||
|       auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf); | ||||
|       if ( err !=  cudaSuccess) { | ||||
| 	std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; | ||||
| 	exit(EXIT_FAILURE); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
| #ifdef GRID_HIP | ||||
|     hipIpcMemHandle_t handle;     | ||||
|     if ( r==WorldShmRank ) {  | ||||
|       auto err = hipIpcGetMemHandle(&handle,ShmCommBuf); | ||||
|       if ( err !=  hipSuccess) { | ||||
| 	std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl; | ||||
| 	exit(EXIT_FAILURE); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|     ////////////////////////////////////////////////// | ||||
|     // Share this IPC handle across the Shm Comm | ||||
|     ////////////////////////////////////////////////// | ||||
| @@ -500,13 +508,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||
|     // If I am not the source, overwrite thisBuf with remote buffer | ||||
|     /////////////////////////////////////////////////////////////// | ||||
|     void * thisBuf = ShmCommBuf; | ||||
| #ifdef GRID_CUDA | ||||
|     if ( r!=WorldShmRank ) {  | ||||
|       err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess); | ||||
|       auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess); | ||||
|       if ( err !=  cudaSuccess) { | ||||
| 	std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; | ||||
| 	exit(EXIT_FAILURE); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
| #ifdef GRID_HIP | ||||
|     if ( r!=WorldShmRank ) {  | ||||
|       auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess); | ||||
|       if ( err !=  hipSuccess) { | ||||
| 	std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl; | ||||
| 	exit(EXIT_FAILURE); | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|     /////////////////////////////////////////////////////////////// | ||||
|     // Save a copy of the device buffers | ||||
|     /////////////////////////////////////////////////////////////// | ||||
|   | ||||
| @@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include <Grid/lattice/Lattice_local.h> | ||||
| #include <Grid/lattice/Lattice_reduction.h> | ||||
| #include <Grid/lattice/Lattice_peekpoke.h> | ||||
| #include <Grid/lattice/Lattice_reality.h> | ||||
| //#include <Grid/lattice/Lattice_reality.h> | ||||
| #include <Grid/lattice/Lattice_real_imag.h> | ||||
| #include <Grid/lattice/Lattice_comparison_utils.h> | ||||
| #include <Grid/lattice/Lattice_comparison.h> | ||||
|   | ||||
| @@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){ | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   autoView( rhs_v , rhs, AcceleratorRead); | ||||
|   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) tmp; | ||||
|     auto lhs_t=lhs_v(ss); | ||||
|     auto rhs_t=rhs_v(ss); | ||||
|     auto tmp  =ret_v(ss); | ||||
|     mac(&tmp,&lhs_t,&rhs_t); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   }); | ||||
| @@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){ | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( lhs_v , lhs, AcceleratorRead); | ||||
|   accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) tmp; | ||||
|     auto tmp  =ret_v(ss); | ||||
|     auto lhs_t=lhs_v(ss); | ||||
|     mac(&tmp,&lhs_t,&rhs); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
| @@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){ | ||||
|   autoView( ret_v , ret, AcceleratorWrite); | ||||
|   autoView( rhs_v , lhs, AcceleratorRead); | ||||
|   accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ | ||||
|     decltype(coalescedRead(obj1())) tmp; | ||||
|     auto tmp  =ret_v(ss); | ||||
|     auto rhs_t=rhs_v(ss); | ||||
|     mac(&tmp,&lhs,&rhs_t); | ||||
|     coalescedWrite(ret_v[ss],tmp); | ||||
|   | ||||
| @@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| #ifdef GRID_HIP | ||||
| extern hipDeviceProp_t *gpu_props; | ||||
| #define WARP_SIZE 64 | ||||
| #endif | ||||
| #ifdef GRID_CUDA | ||||
| extern cudaDeviceProp *gpu_props; | ||||
| #define WARP_SIZE 32 | ||||
| #endif | ||||
|  | ||||
| #define WARP_SIZE 32 | ||||
| __device__ unsigned int retirementCount = 0; | ||||
|  | ||||
| template <class Iterator> | ||||
| @@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid | ||||
|    | ||||
|   // cannot use overloaded operators for sobj as they are not volatile-qualified | ||||
|   memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); | ||||
|   __syncwarp(); | ||||
|   acceleratorSynchronise(); | ||||
|    | ||||
|   const Iterator VEC = WARP_SIZE; | ||||
|   const Iterator vid = tid & (VEC-1); | ||||
| @@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid | ||||
|       beta += temp; | ||||
|       memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); | ||||
|     } | ||||
|     __syncwarp(); | ||||
|     acceleratorSynchronise(); | ||||
|   } | ||||
|   __syncthreads(); | ||||
|   acceleratorSynchroniseAll(); | ||||
|    | ||||
|   if (threadIdx.x == 0) { | ||||
|     beta  = Zero(); | ||||
| @@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid | ||||
|     } | ||||
|     memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); | ||||
|   } | ||||
|   __syncthreads(); | ||||
|   acceleratorSynchroniseAll(); | ||||
| } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -52,7 +52,6 @@ public: | ||||
| // This will be safe to call from accelerator_for and is trivially copy constructible | ||||
| // The copy constructor for this will need to be used by device lambda functions | ||||
| ///////////////////////////////////////////////////////////////////////////////////////// | ||||
| #undef LATTICE_BOUNDS_CHECK | ||||
| template<class vobj>  | ||||
| class LatticeView : public LatticeAccelerator<vobj> | ||||
| { | ||||
| @@ -62,36 +61,14 @@ public: | ||||
|   void * cpu_ptr; | ||||
| #ifdef GRID_SIMT | ||||
|   accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {  | ||||
| #ifdef LATTICE_BOUNDS_CHECK | ||||
|     assert(i<this->_odata_size); | ||||
|     assert(i>=0); | ||||
| #endif | ||||
|     return coalescedRead(this->_odata[i]);  | ||||
|   } | ||||
| #else  | ||||
|   accelerator_inline const vobj & operator()(size_t i) const { | ||||
| #ifdef LATTICE_BOUNDS_CHECK | ||||
|     assert(i<this->_odata_size); | ||||
|     assert(i>=0); | ||||
| #endif | ||||
|     return this->_odata[i]; | ||||
|   } | ||||
|   accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } | ||||
| #endif | ||||
|  | ||||
|   accelerator_inline const vobj & operator[](size_t i) const {  | ||||
| #ifdef LATTICE_BOUNDS_CHECK | ||||
|     assert(i<this->_odata_size); | ||||
|     assert(i>=0); | ||||
| #endif | ||||
|     return this->_odata[i];  | ||||
|   }; | ||||
|   accelerator_inline vobj       & operator[](size_t i)       {  | ||||
| #ifdef LATTICE_BOUNDS_CHECK | ||||
|     assert(i<this->_odata_size); | ||||
|     assert(i>=0); | ||||
| #endif | ||||
|     return this->_odata[i];  | ||||
|   }; | ||||
|   accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; | ||||
|   accelerator_inline vobj       & operator[](size_t i)       { return this->_odata[i]; }; | ||||
|  | ||||
|   accelerator_inline uint64_t begin(void) const { return 0;}; | ||||
|   accelerator_inline uint64_t end(void)   const { return this->_odata_size; }; | ||||
|   | ||||
| @@ -130,6 +130,8 @@ public: | ||||
|   friend std::ostream& operator<< (std::ostream& stream, Logger& log){ | ||||
|  | ||||
|     if ( log.active ) { | ||||
|       std::ios_base::fmtflags f(stream.flags()); | ||||
|  | ||||
|       stream << log.background()<<  std::left; | ||||
|       if (log.topWidth > 0) | ||||
|       { | ||||
| @@ -152,6 +154,8 @@ public: | ||||
| 	       << now	       << log.background() << " : " ; | ||||
|       } | ||||
|       stream << log.colour(); | ||||
|       stream.flags(f); | ||||
|  | ||||
|       return stream; | ||||
|     } else {  | ||||
|       return devnull; | ||||
|   | ||||
| @@ -1,3 +1,4 @@ | ||||
| #include <Grid/GridCore.h> | ||||
|  | ||||
| int Grid::BinaryIO::latticeWriteMaxRetry = -1; | ||||
| int                    Grid::BinaryIO::latticeWriteMaxRetry = -1; | ||||
| Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf; | ||||
|   | ||||
| @@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key) | ||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| class BinaryIO { | ||||
|  public: | ||||
|   struct IoPerf | ||||
|   { | ||||
|     uint64_t size{0},time{0}; | ||||
|     double   mbytesPerSecond{0.}; | ||||
|   }; | ||||
|  | ||||
|   static IoPerf lastPerf; | ||||
|   static int latticeWriteMaxRetry; | ||||
|  | ||||
|   ///////////////////////////////////////////////////////////////////////////// | ||||
| @@ -502,12 +509,15 @@ class BinaryIO { | ||||
|       timer.Stop(); | ||||
|     } | ||||
|      | ||||
|     lastPerf.size            = sizeof(fobj)*iodata.size()*nrank; | ||||
|     lastPerf.time            = timer.useconds(); | ||||
|     lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6); | ||||
|     std::cout<<GridLogMessage<<"IOobject: "; | ||||
|     if ( control & BINARYIO_READ) std::cout << " read  "; | ||||
|     else                          std::cout << " write "; | ||||
|     uint64_t bytes = sizeof(fobj)*iodata.size()*nrank; | ||||
|     std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" " | ||||
| 	     << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl; | ||||
|     std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" " | ||||
| 	     << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl; | ||||
|  | ||||
|     std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl; | ||||
|  | ||||
| @@ -663,10 +673,15 @@ class BinaryIO { | ||||
| 	     nersc_csum,scidac_csuma,scidac_csumb); | ||||
|  | ||||
|     timer.Start(); | ||||
|     thread_for(lidx,lsites,{ | ||||
|     thread_for(lidx,lsites,{  // FIX ME, suboptimal implementation | ||||
|       std::vector<RngStateType> tmp(RngStateCount); | ||||
|       std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); | ||||
|       parallel_rng.SetState(tmp,lidx); | ||||
|       Coordinate lcoor; | ||||
|       grid->LocalIndexToLocalCoor(lidx, lcoor); | ||||
|       int o_idx=grid->oIndex(lcoor); | ||||
|       int i_idx=grid->iIndex(lcoor); | ||||
|       int gidx=parallel_rng.generator_idx(o_idx,i_idx); | ||||
|       parallel_rng.SetState(tmp,gidx); | ||||
|       }); | ||||
|     timer.Stop(); | ||||
|  | ||||
| @@ -723,7 +738,12 @@ class BinaryIO { | ||||
|     std::vector<RNGstate> iodata(lsites); | ||||
|     thread_for(lidx,lsites,{ | ||||
|       std::vector<RngStateType> tmp(RngStateCount); | ||||
|       parallel_rng.GetState(tmp,lidx); | ||||
|       Coordinate lcoor; | ||||
|       grid->LocalIndexToLocalCoor(lidx, lcoor); | ||||
|       int o_idx=grid->oIndex(lcoor); | ||||
|       int i_idx=grid->iIndex(lcoor); | ||||
|       int gidx=parallel_rng.generator_idx(o_idx,i_idx); | ||||
|       parallel_rng.GetState(tmp,gidx); | ||||
|       std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); | ||||
|     }); | ||||
|     timer.Stop(); | ||||
|   | ||||
| @@ -47,7 +47,7 @@ static constexpr int Ym = 5; | ||||
| static constexpr int Zm = 6; | ||||
| static constexpr int Tm = 7; | ||||
|  | ||||
| static constexpr int Nc=3; | ||||
| static constexpr int Nc=Config_Nc; | ||||
| static constexpr int Ns=4; | ||||
| static constexpr int Nd=4; | ||||
| static constexpr int Nhs=2; // half spinor | ||||
| @@ -77,16 +77,9 @@ const int SpinorIndex = 2; | ||||
| template<typename T> struct isSpinor { | ||||
|   static constexpr bool value = (SpinorIndex==T::TensorLevel); | ||||
| }; | ||||
| const int CoarseIndex = 4; | ||||
| template<typename T> struct isCoarsened { | ||||
|   static constexpr bool value = (CoarseIndex<=T::TensorLevel); | ||||
| }; | ||||
| template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ; | ||||
| template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ; | ||||
|  | ||||
| template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ; | ||||
| template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ; | ||||
|  | ||||
| // ChrisK very keen to add extra space for Gparity doubling. | ||||
| // | ||||
| // Also add domain wall index, in a way where Wilson operator  | ||||
|   | ||||
| @@ -89,8 +89,7 @@ public: | ||||
|   virtual void  Mdiag  (const FermionField &in, FermionField &out) { Mooee(in,out);};   // Same as Mooee applied to both CB's | ||||
|   virtual void  Mdir   (const FermionField &in, FermionField &out,int dir,int disp)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac | ||||
|   virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out)=0;   // case by case Wilson, Clover, Cayley, ContFrac, PartFrac | ||||
|   virtual std::vector<int> Directions(void)   =0; | ||||
|   virtual std::vector<int> Displacements(void)=0; | ||||
|  | ||||
|  | ||||
|   virtual void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);}; | ||||
|  | ||||
|   | ||||
| @@ -44,9 +44,6 @@ public: | ||||
|   INHERIT_IMPL_TYPES(Impl); | ||||
|   typedef StaggeredKernels<Impl> Kernels; | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   { return this->directions; }; | ||||
|   virtual std::vector<int> Displacements(void){ return this->displacements;}; | ||||
|  | ||||
|   FermionField _tmp; | ||||
|   FermionField &tmp(void) { return _tmp; } | ||||
|  | ||||
|   | ||||
| @@ -49,9 +49,6 @@ public: | ||||
|   INHERIT_IMPL_TYPES(Impl); | ||||
|   typedef StaggeredKernels<Impl> Kernels; | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   { return this->directions; }; | ||||
|   virtual std::vector<int> Displacements(void){ return this->displacements;}; | ||||
|  | ||||
|   FermionField _tmp; | ||||
|   FermionField &tmp(void) { return _tmp; } | ||||
|  | ||||
|   | ||||
| @@ -47,9 +47,6 @@ public: | ||||
|   FermionField _tmp; | ||||
|   FermionField &tmp(void) { return _tmp; } | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   { return this->directions; }; | ||||
|   virtual std::vector<int> Displacements(void){ return this->displacements;}; | ||||
|  | ||||
|   //////////////////////////////////////// | ||||
|   // Performance monitoring | ||||
|   //////////////////////////////////////// | ||||
|   | ||||
| @@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub | ||||
|    /////////////////////////////////////////////////////////////////////////////////////// | ||||
|    // Generic Nc kernels | ||||
|    /////////////////////////////////////////////////////////////////////////////////////// | ||||
|    template<int Naik> accelerator_inline | ||||
|    template<int Naik>  | ||||
|    static accelerator_inline | ||||
|    void DhopSiteGeneric(StencilView &st,  | ||||
| 			DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,  | ||||
| 			SiteSpinor * buf, int LLs, int sU,  | ||||
| 			const FermionFieldView &in, FermionFieldView &out,int dag); | ||||
|    template<int Naik> accelerator_inline | ||||
|     | ||||
|    template<int Naik> static accelerator_inline | ||||
|    void DhopSiteGenericInt(StencilView &st,  | ||||
| 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,  | ||||
| 			   SiteSpinor * buf, int LLs, int sU,  | ||||
| 			   const FermionFieldView &in, FermionFieldView &out,int dag); | ||||
|    template<int Naik> accelerator_inline | ||||
|     | ||||
|    template<int Naik> static accelerator_inline | ||||
|    void DhopSiteGenericExt(StencilView &st,  | ||||
| 			   DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, | ||||
| 			   SiteSpinor * buf, int LLs, int sU,  | ||||
| @@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub | ||||
|    /////////////////////////////////////////////////////////////////////////////////////// | ||||
|    // Nc=3 specific kernels | ||||
|    /////////////////////////////////////////////////////////////////////////////////////// | ||||
|    template<int Naik> accelerator_inline | ||||
|     | ||||
|    template<int Naik> static accelerator_inline | ||||
|    void DhopSiteHand(StencilView &st,  | ||||
| 		     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,  | ||||
| 		     SiteSpinor * buf, int LLs, int sU,  | ||||
| 		     const FermionFieldView &in, FermionFieldView &out,int dag); | ||||
|    template<int Naik> accelerator_inline | ||||
|     | ||||
|    template<int Naik> static accelerator_inline | ||||
|    void DhopSiteHandInt(StencilView &st,  | ||||
| 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,  | ||||
| 			SiteSpinor * buf, int LLs, int sU,  | ||||
| 			const FermionFieldView &in, FermionFieldView &out,int dag); | ||||
|    template<int Naik> accelerator_inline | ||||
|     | ||||
|    template<int Naik> static accelerator_inline | ||||
|    void DhopSiteHandExt(StencilView &st,  | ||||
| 			DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,  | ||||
| 			SiteSpinor * buf, int LLs, int sU,  | ||||
| @@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub | ||||
|    /////////////////////////////////////////////////////////////////////////////////////// | ||||
|    // Asm Nc=3 specific kernels | ||||
|    /////////////////////////////////////////////////////////////////////////////////////// | ||||
|     | ||||
|    void DhopSiteAsm(StencilView &st,  | ||||
| 		    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,  | ||||
| 		    SiteSpinor * buf, int LLs, int sU,  | ||||
|   | ||||
| @@ -63,9 +63,6 @@ public: | ||||
|   INHERIT_IMPL_TYPES(Impl); | ||||
|   typedef WilsonKernels<Impl> Kernels; | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   { return this->directions; }; | ||||
|   virtual std::vector<int> Displacements(void){ return this->displacements;}; | ||||
|  | ||||
|   /////////////////////////////////////////////////////////////// | ||||
|   // Implement the abstract base | ||||
|   /////////////////////////////////////////////////////////////// | ||||
|   | ||||
| @@ -72,9 +72,6 @@ public: | ||||
|   typedef WilsonKernels<Impl> Kernels; | ||||
|   PmuStat stat; | ||||
|  | ||||
|   virtual std::vector<int> Directions(void)   { return this->directions; }; | ||||
|   virtual std::vector<int> Displacements(void){ return this->displacements;}; | ||||
|  | ||||
|   FermionField _tmp; | ||||
|   FermionField &tmp(void) { return _tmp; } | ||||
|  | ||||
|   | ||||
| @@ -79,8 +79,6 @@ public: | ||||
|     _Mat.M(in,tmp); | ||||
|     G5R5(out,tmp); | ||||
|   } | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
| }; | ||||
|  | ||||
|  | ||||
| @@ -129,8 +127,6 @@ public: | ||||
|     _Mat.M(in,tmp); | ||||
|     out=g5*tmp; | ||||
|   } | ||||
|   virtual std::vector<int> Directions(void)   { return _Mat.Directions();}; | ||||
|   virtual std::vector<int> Displacements(void){ return _Mat.Displacements();}; | ||||
| }; | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|   | ||||
| @@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, | ||||
|  | ||||
|   PropagatorField tmp(UGrid); | ||||
|   PropagatorField Utmp(UGrid); | ||||
|   LatticeInteger zz (UGrid);   zz=0.0; | ||||
|   PropagatorField zz (UGrid);   zz=0.0; | ||||
|   LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); | ||||
|   for (int s=0;s<Ls;s++) { | ||||
|  | ||||
| @@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in, | ||||
|   PropagatorField tmp(UGrid); | ||||
|   PropagatorField Utmp(UGrid); | ||||
|  | ||||
|   LatticeInteger zz (UGrid);   zz=0.0; | ||||
|   PropagatorField  zz (UGrid);   zz=0.0; | ||||
|   LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); | ||||
|  | ||||
|   for(int s=0;s<Ls;s++){ | ||||
|   | ||||
| @@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
|  | ||||
| template <class Impl> | ||||
| template <int Naik> | ||||
| template <int Naik> accelerator_inline | ||||
| void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st, | ||||
| 					  DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, | ||||
| 					  SiteSpinor *buf, int sF, int sU,  | ||||
| @@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st, | ||||
|  | ||||
|  | ||||
| template <class Impl> | ||||
| template <int Naik> | ||||
| template <int Naik> accelerator_inline | ||||
| void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,  | ||||
| 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, | ||||
| 					     SiteSpinor *buf, int sF, int sU,  | ||||
| @@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, | ||||
|  | ||||
|  | ||||
| template <class Impl> | ||||
| template <int Naik> | ||||
| template <int Naik> accelerator_inline | ||||
| void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st, | ||||
| 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, | ||||
| 					     SiteSpinor *buf, int sF, int sU,  | ||||
|   | ||||
| @@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){}; | ||||
| // Int, Ext, Int+Ext cases for comms overlap | ||||
| //////////////////////////////////////////////////////////////////////////////////// | ||||
| template <class Impl> | ||||
| template <int Naik> | ||||
| template <int Naik> accelerator_inline | ||||
| void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,  | ||||
| 					     DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, | ||||
| 					     SiteSpinor *buf, int sF, int sU,  | ||||
| @@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, | ||||
|   // Only contributions from interior of our node | ||||
|   /////////////////////////////////////////////////// | ||||
| template <class Impl> | ||||
| template <int Naik> | ||||
| template <int Naik> accelerator_inline | ||||
| void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,  | ||||
| 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, | ||||
| 						SiteSpinor *buf, int sF, int sU,  | ||||
| @@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, | ||||
|   // Only contributions from exterior of our node | ||||
|   /////////////////////////////////////////////////// | ||||
| template <class Impl> | ||||
| template <int Naik> | ||||
| template <int Naik> accelerator_inline | ||||
| void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,  | ||||
| 						DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, | ||||
| 						SiteSpinor *buf, int sF, int sU, | ||||
| @@ -224,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, | ||||
| //////////////////////////////////////////////////////////////////////////////////// | ||||
| // Driving / wrapping routine to select right kernel | ||||
| //////////////////////////////////////////////////////////////////////////////////// | ||||
| template <class Impl> | ||||
| template <class Impl>  | ||||
| void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, | ||||
| 					   int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp) | ||||
| { | ||||
| @@ -253,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie | ||||
|       ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);		\ | ||||
|   }); | ||||
|  | ||||
| template <class Impl> | ||||
| template <class Impl>  | ||||
| void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,  | ||||
| 					  DoubledGaugeField &U, DoubledGaugeField &UUU,  | ||||
| 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior) | ||||
| @@ -293,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, | ||||
|   } | ||||
|   assert(0 && " Kernel optimisation case not covered "); | ||||
| } | ||||
| template <class Impl> | ||||
| template <class Impl>  | ||||
| void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,  | ||||
| 				       DoubledGaugeField &U, | ||||
| 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior) | ||||
|   | ||||
| @@ -133,14 +133,14 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu) | ||||
|   pickCheckerboard(Even, CloverTermEven, CloverTerm); | ||||
|   pickCheckerboard(Odd, CloverTermOdd, CloverTerm); | ||||
|  | ||||
|   pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm)); | ||||
|   pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm)); | ||||
|   pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm))); | ||||
|   pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm))); | ||||
|  | ||||
|   pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); | ||||
|   pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); | ||||
|  | ||||
|   pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); | ||||
|   pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); | ||||
|   pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv))); | ||||
|   pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv))); | ||||
| } | ||||
|  | ||||
| template <class Impl> | ||||
|   | ||||
| @@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|   HAND_RESULT_EXT(ss,F) | ||||
|  | ||||
| #define HAND_SPECIALISE_GPARITY(IMPL)					\ | ||||
|   template<> void						\ | ||||
|   template<> accelerator_inline void						\ | ||||
|   WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \ | ||||
| 				    int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ | ||||
|   {									\ | ||||
| @@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|     HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||
|   }									\ | ||||
| 									\ | ||||
|   template<> void						\ | ||||
|   template<> accelerator_inline void						\ | ||||
|   WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ | ||||
| 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ | ||||
|   {									\ | ||||
| @@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|     HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||
|   }									\ | ||||
| 									\ | ||||
|   template<> void						\ | ||||
|   template<> accelerator_inline void						\ | ||||
|   WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \ | ||||
| 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ | ||||
|   {									\ | ||||
| @@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|     HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||
|   }									\ | ||||
| 									\ | ||||
|   template<> void						\ | ||||
|   template<> accelerator_inline void						\ | ||||
|   WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ | ||||
| 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ | ||||
|   {									\ | ||||
| @@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|     HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||
|   }									\ | ||||
| 									\ | ||||
|   template<> void							\ | ||||
|   template<> accelerator_inline void							\ | ||||
|   WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, \ | ||||
| 				       int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ | ||||
|   {									\ | ||||
| @@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|     nmu = 0;								\ | ||||
|     HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ | ||||
|   }									\ | ||||
|   template<> void						\ | ||||
|   template<> accelerator_inline void						\ | ||||
|   WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ | ||||
| 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ | ||||
|   {									\ | ||||
|   | ||||
| @@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| template<class Impl> void  | ||||
| template<class Impl> accelerator_inline void  | ||||
| WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, | ||||
| 				  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) | ||||
| { | ||||
| @@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site | ||||
|   HAND_RESULT(ss); | ||||
| } | ||||
|  | ||||
| template<class Impl> | ||||
| template<class Impl>  accelerator_inline | ||||
| void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, | ||||
| 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) | ||||
| { | ||||
| @@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView | ||||
|   HAND_RESULT(ss); | ||||
| } | ||||
|  | ||||
| template<class Impl> void  | ||||
| template<class Impl>  accelerator_inline void  | ||||
| WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, | ||||
| 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) | ||||
| { | ||||
| @@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si | ||||
|   HAND_RESULT(ss); | ||||
| } | ||||
|  | ||||
| template<class Impl> | ||||
| template<class Impl> accelerator_inline | ||||
| void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, | ||||
| 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) | ||||
| { | ||||
| @@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi | ||||
|   HAND_RESULT(ss); | ||||
| } | ||||
|  | ||||
| template<class Impl> void  | ||||
| template<class Impl>  accelerator_inline void  | ||||
| WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor  *buf, | ||||
| 					  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) | ||||
| { | ||||
| @@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si | ||||
|   HAND_RESULT_EXT(ss); | ||||
| } | ||||
|  | ||||
| template<class Impl> | ||||
| template<class Impl>  accelerator_inline | ||||
| void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, | ||||
| 						  int ss,int sU,const FermionFieldView &in, FermionFieldView &out) | ||||
| { | ||||
|   | ||||
| @@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip) | ||||
|   //////////////////////////////////////////////////////////////////// | ||||
|   // All legs kernels ; comms then compute | ||||
|   //////////////////////////////////////////////////////////////////// | ||||
| template <class Impl> | ||||
| template <class Impl> accelerator_inline | ||||
| void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, | ||||
| 					     SiteHalfSpinor *buf, int sF, | ||||
| 					     int sU, const FermionFieldView &in, FermionFieldView &out) | ||||
| @@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV | ||||
|   coalescedWrite(out[sF],result,lane); | ||||
| }; | ||||
|  | ||||
| template <class Impl> | ||||
| template <class Impl> accelerator_inline | ||||
| void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, | ||||
| 					  SiteHalfSpinor *buf, int sF, | ||||
| 					  int sU, const FermionFieldView &in, FermionFieldView &out) | ||||
| @@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView | ||||
|   //////////////////////////////////////////////////////////////////// | ||||
|   // Interior kernels | ||||
|   //////////////////////////////////////////////////////////////////// | ||||
| template <class Impl> | ||||
| template <class Impl> accelerator_inline | ||||
| void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFieldView &U, | ||||
| 						SiteHalfSpinor *buf, int sF, | ||||
| 						int sU, const FermionFieldView &in, FermionFieldView &out) | ||||
| @@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st,  DoubledGaugeFi | ||||
|   coalescedWrite(out[sF], result,lane); | ||||
| }; | ||||
|  | ||||
| template <class Impl> | ||||
| template <class Impl> accelerator_inline | ||||
| void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeFieldView &U, | ||||
| 							 SiteHalfSpinor *buf, int sF, | ||||
| 							 int sU, const FermionFieldView &in, FermionFieldView &out) | ||||
| @@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st,  DoubledGaugeField | ||||
| //////////////////////////////////////////////////////////////////// | ||||
| // Exterior kernels | ||||
| //////////////////////////////////////////////////////////////////// | ||||
| template <class Impl> | ||||
| template <class Impl> accelerator_inline | ||||
| void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFieldView &U, | ||||
| 						SiteHalfSpinor *buf, int sF, | ||||
| 						int sU, const FermionFieldView &in, FermionFieldView &out) | ||||
| @@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st,  DoubledGaugeFi | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <class Impl> | ||||
| template <class Impl> accelerator_inline | ||||
| void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeFieldView &U, | ||||
| 					     SiteHalfSpinor *buf, int sF, | ||||
| 					     int sU, const FermionFieldView &in, FermionFieldView &out) | ||||
| @@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st,  DoubledGaugeField | ||||
| }; | ||||
|  | ||||
| #define DhopDirMacro(Dir,spProj,spRecon)	\ | ||||
|   template <class Impl>							\ | ||||
|   template <class Impl> accelerator_inline				\ | ||||
|   void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ | ||||
| 					 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ | ||||
|   {									\ | ||||
| @@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm); | ||||
| DhopDirMacro(Zm,spProjZm,spReconZm); | ||||
| DhopDirMacro(Tm,spProjTm,spReconTm); | ||||
|  | ||||
| template <class Impl> | ||||
| template <class Impl> accelerator_inline | ||||
| void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, | ||||
| 				    int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) | ||||
| { | ||||
|   | ||||
| @@ -128,6 +128,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   hspin(0)=fspin(0)-fspin(2); | ||||
|   hspin(1)=fspin(1)-fspin(3); | ||||
| } | ||||
| @@ -137,50 +138,40 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
|  *  0 0 -1  0 | ||||
|  *  0 0  0 -1 | ||||
|  */ | ||||
|  | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   hspin(0)=fspin(0); | ||||
|   hspin(1)=fspin(1); | ||||
| } | ||||
|  | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   hspin(0)=fspin(2); | ||||
|   hspin(1)=fspin(3); | ||||
| } | ||||
|    | ||||
| //  template<class vtype> accelerator_inline void fspProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin) | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   rfspin(0)=fspin(0); | ||||
|   rfspin(1)=fspin(1); | ||||
|   rfspin(2)=Zero(); | ||||
|   rfspin(3)=Zero(); | ||||
| } | ||||
| //  template<class vtype> accelerator_inline void fspProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin) | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   rfspin(0)=Zero(); | ||||
|   rfspin(1)=Zero(); | ||||
|   rfspin(2)=fspin(2); | ||||
|   rfspin(3)=fspin(3); | ||||
| } | ||||
|  | ||||
| template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   const int hN = N>>1; | ||||
|   for(int s=0;s<hN;s++){ | ||||
|     rfspin(s)=fspin(s); | ||||
|     rfspin(s+hN)=Zero(); | ||||
|   } | ||||
| } | ||||
| template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   const int hN = N>>1; | ||||
|   for(int s=0;s<hN;s++){ | ||||
|     rfspin(s)=Zero(); | ||||
|     rfspin(s+hN)=fspin(s+hN); | ||||
|   } | ||||
| } | ||||
|  | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Reconstruction routines to move back again to four spin | ||||
| //////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| @@ -192,6 +183,7 @@ template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inlin | ||||
|  */ | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0); | ||||
|   fspin(1)=hspin(1); | ||||
|   fspin(2)=timesMinusI(hspin(1)); | ||||
| @@ -199,6 +191,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0); | ||||
|   fspin(1)=hspin(1); | ||||
|   fspin(2)=timesI(hspin(1)); | ||||
| @@ -206,6 +199,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   fspin(2)-=timesI(hspin(1)); | ||||
| @@ -213,6 +207,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   fspin(2)+=timesI(hspin(1)); | ||||
| @@ -226,6 +221,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
|  | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0); | ||||
|   fspin(1)=hspin(1); | ||||
|   fspin(2)= hspin(1); | ||||
| @@ -233,6 +229,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0); | ||||
|   fspin(1)=hspin(1); | ||||
|   fspin(2)=-hspin(1); | ||||
| @@ -240,6 +237,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   fspin(2)+=hspin(1); | ||||
| @@ -247,6 +245,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   fspin(2)-=hspin(1); | ||||
| @@ -261,6 +260,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
|  */ | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0); | ||||
|   fspin(1)=hspin(1); | ||||
|   fspin(2)=timesMinusI(hspin(0)); | ||||
| @@ -268,6 +268,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0); | ||||
|   fspin(1)=hspin(1); | ||||
|   fspin(2)=     timesI(hspin(0)); | ||||
| @@ -275,6 +276,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   fspin(2)-=timesI(hspin(0)); | ||||
| @@ -282,6 +284,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   fspin(2)+=timesI(hspin(0)); | ||||
| @@ -295,6 +298,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
|  */ | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0); | ||||
|   fspin(1)=hspin(1); | ||||
|   fspin(2)=hspin(0); | ||||
| @@ -302,6 +306,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0); | ||||
|   fspin(1)=hspin(1); | ||||
|   fspin(2)=-hspin(0); | ||||
| @@ -309,6 +314,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   fspin(2)+=hspin(0); | ||||
| @@ -316,6 +322,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)+=hspin(0); | ||||
|   fspin(1)+=hspin(1); | ||||
|   fspin(2)-=hspin(0); | ||||
| @@ -329,6 +336,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
|  */ | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul | ||||
|   fspin(1)=hspin(1)+hspin(1); // probably no measurable difference though | ||||
|   fspin(2)=Zero(); | ||||
| @@ -336,6 +344,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   fspin(0)=Zero(); | ||||
|   fspin(1)=Zero(); | ||||
|   fspin(2)=hspin(0)+hspin(0); | ||||
| @@ -343,6 +352,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s | ||||
| } | ||||
| // Spinor-level accumRecon5p: adds hspin(s)+hspin(s) into fspin(s) for components 0 and 1 only. | ||||
| template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; | ||||
|   // The doubling is written as a sum of the component with itself; no other fspin component is modified. | ||||
|   for(int s=0; s<2; ++s) { | ||||
|     fspin(s) += hspin(s)+hspin(s); | ||||
|   } | ||||
| } | ||||
| @@ -362,6 +372,7 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a | ||||
| ////////// | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     spProjXp(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| @@ -415,21 +426,26 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconXp (iM | ||||
|     }} | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| //////// | ||||
| // Xm | ||||
| //////// | ||||
| // iScalar overload of spProjXm: unwraps both arguments and forwards to spProjXm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spProjXm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spProjXm (enabled through IfNotSpinor): applies spProjXm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spProjXm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjXm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -439,16 +455,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatri | ||||
|  | ||||
| // iScalar overload of spReconXm: unwraps both arguments and forwards to spReconXm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spReconXm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spReconXm (enabled through IfNotSpinor): applies spReconXm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spReconXm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconXm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -457,37 +476,45 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatr | ||||
|  | ||||
| // iScalar overload of accumReconXm: unwraps both arguments and forwards to accumReconXm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   accumReconXm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of accumReconXm (enabled through IfNotSpinor): applies accumReconXm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     accumReconXm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| // iMatrix overload of accumReconXm: applies accumReconXm to every (row, column) element pair of the N x N matrices. | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int row=0; row<N; ++row) { | ||||
|     for(int col=0; col<N; ++col) { | ||||
|       accumReconXm(hspin._internal[row][col], fspin._internal[row][col]); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| //////// | ||||
| // Yp | ||||
| //////// | ||||
| // iScalar overload of spProjYp: unwraps both arguments and forwards to spProjYp on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spProjYp(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spProjYp (enabled through IfNotSpinor): applies spProjYp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spProjYp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjYp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -497,16 +524,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatri | ||||
|  | ||||
| // iScalar overload of spReconYp: unwraps both arguments and forwards to spReconYp on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spReconYp(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spReconYp (enabled through IfNotSpinor): applies spReconYp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spReconYp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconYp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -515,55 +545,66 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatr | ||||
|  | ||||
| // iScalar overload of accumReconYp: unwraps both arguments and forwards to accumReconYp on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   accumReconYp(lhs, rhs); | ||||
| } | ||||
| // iVector overload of accumReconYp (enabled through IfNotSpinor): applies accumReconYp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     accumReconYp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| // iMatrix overload of accumReconYp: applies accumReconYp to every (row, column) element pair of the N x N matrices. | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int row=0; row<N; ++row) { | ||||
|     for(int col=0; col<N; ++col) { | ||||
|       accumReconYp(hspin._internal[row][col], fspin._internal[row][col]); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| //////// | ||||
| // Ym | ||||
| //////// | ||||
| // iScalar overload of spProjYm: unwraps both arguments and forwards to spProjYm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spProjYm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spProjYm (enabled through IfNotSpinor): applies spProjYm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spProjYm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| // iMatrix overload of spProjYm: applies spProjYm to every (row, column) element pair of the N x N matrices. | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int row=0; row<N; ++row) { | ||||
|     for(int col=0; col<N; ++col) { | ||||
|       spProjYm(hspin._internal[row][col], fspin._internal[row][col]); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| // iScalar overload of spReconYm: unwraps both arguments and forwards to spReconYm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spReconYm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spReconYm (enabled through IfNotSpinor): applies spReconYm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,const iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spReconYm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconYm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -572,16 +613,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatr | ||||
|  | ||||
| // iScalar overload of accumReconYm: unwraps both arguments and forwards to accumReconYm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   accumReconYm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of accumReconYm (enabled through IfNotSpinor): applies accumReconYm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     accumReconYm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconYm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -594,57 +638,66 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iM | ||||
| //////// | ||||
| // iScalar overload of spProjZp: unwraps both arguments and forwards to spProjZp on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spProjZp(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spProjZp (enabled through IfNotSpinor): applies spProjZp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spProjZp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjZp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|   }} | ||||
|     }} | ||||
| } | ||||
|  | ||||
|  | ||||
| // iScalar overload of spReconZp: unwraps both arguments and forwards to spReconZp on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spReconZp(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spReconZp (enabled through IfNotSpinor): applies spReconZp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spReconZp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconZp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|   }} | ||||
|     }} | ||||
| } | ||||
|  | ||||
| // iScalar overload of accumReconZp: unwraps both arguments and forwards to accumReconZp on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   accumReconZp(lhs, rhs); | ||||
| } | ||||
| // iVector overload of accumReconZp (enabled through IfNotSpinor): applies accumReconZp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     accumReconZp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconZp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|   }} | ||||
|     }} | ||||
| } | ||||
|  | ||||
|  | ||||
| @@ -653,53 +706,62 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iM | ||||
| //////// | ||||
| // iScalar overload of spProjZm: unwraps both arguments and forwards to spProjZm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spProjZm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spProjZm (enabled through IfNotSpinor): applies spProjZm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spProjZm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjZm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|   }} | ||||
|     }} | ||||
| } | ||||
|  | ||||
|  | ||||
| // iScalar overload of spReconZm: unwraps both arguments and forwards to spReconZm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spReconZm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spReconZm (enabled through IfNotSpinor): applies spReconZm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spReconZm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconZm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|   }} | ||||
|     }} | ||||
| } | ||||
|  | ||||
| // iScalar overload of accumReconZm: unwraps both arguments and forwards to accumReconZm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   accumReconZm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of accumReconZm (enabled through IfNotSpinor): applies accumReconZm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     accumReconZm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconZm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -712,35 +774,41 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iM | ||||
| //////// | ||||
| // iScalar overload of spProjTp: unwraps both arguments and forwards to spProjTp on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spProjTp(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spProjTp (enabled through IfNotSpinor): applies spProjTp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spProjTp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjTp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|   }} | ||||
|     }} | ||||
| } | ||||
|  | ||||
|  | ||||
| // iScalar overload of spReconTp: unwraps both arguments and forwards to spReconTp on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spReconTp(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spReconTp (enabled through IfNotSpinor): applies spReconTp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spReconTp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconTp(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -749,37 +817,44 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatr | ||||
|  | ||||
| // iScalar overload of accumReconTp: unwraps both arguments and forwards to accumReconTp on the wrapped `_internal` values. | ||||
| // The source argument is taken by const reference, matching every sibling overload; a non-const reference | ||||
| // here cannot bind to the const elements handed down by the const-qualified iVector/iMatrix overloads. | ||||
| template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   accumReconTp(hspin._internal,fspin._internal); | ||||
| } | ||||
| // iVector overload of accumReconTp (enabled through IfNotSpinor): applies accumReconTp to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     accumReconTp(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| // iMatrix overload of accumReconTp: applies accumReconTp to every (row, column) element pair of the N x N matrices. | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int row=0; row<N; ++row) { | ||||
|     for(int col=0; col<N; ++col) { | ||||
|       accumReconTp(hspin._internal[row][col], fspin._internal[row][col]); | ||||
|     } | ||||
|   } | ||||
| } | ||||
|  | ||||
|  | ||||
| //////// | ||||
| // Tm | ||||
| //////// | ||||
| // iScalar overload of spProjTm: unwraps both arguments and forwards to spProjTm on the wrapped `_internal` values. | ||||
| template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   auto       &lhs = hspin._internal; | ||||
|   const auto &rhs = fspin._internal; | ||||
|   spProjTm(lhs, rhs); | ||||
| } | ||||
| // iVector overload of spProjTm (enabled through IfNotSpinor): applies spProjTm to each of the N element pairs. | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int idx=0; idx<N; ++idx) | ||||
|     spProjTm(hspin._internal[idx], fspin._internal[idx]); | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProjTm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -789,16 +864,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatri | ||||
|  | ||||
| template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   spReconTm(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     spReconTm(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spReconTm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -807,37 +885,44 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatr | ||||
|  | ||||
| template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   accumReconTm(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     accumReconTm(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumReconTm(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|     }} | ||||
| } | ||||
|  | ||||
|  | ||||
| //////// | ||||
| // 5p | ||||
| //////// | ||||
| template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| template<class rtype,class vtype> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   spProj5p(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     spProj5p(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProj5p(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -846,16 +931,19 @@ template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> acce | ||||
|  | ||||
| template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   spRecon5p(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     spRecon5p(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spRecon5p(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -864,16 +952,19 @@ template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatr | ||||
|  | ||||
| template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   accumRecon5p(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     accumRecon5p(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -881,18 +972,24 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iM | ||||
| } | ||||
|  | ||||
| // four spinor projectors for chiral proj | ||||
| template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin) | ||||
| //  template<class vtype> accelerator_inline void fspProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin) | ||||
| template<class vtype> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   spProj5p(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| //  template<class vtype,int N> accelerator_inline void fspProj5p (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin) | ||||
| template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     spProj5p(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| //  template<class vtype,int N> accelerator_inline void fspProj5p (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin) | ||||
| template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProj5p(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -904,17 +1001,17 @@ template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inli | ||||
| // 5m | ||||
| //////// | ||||
|  | ||||
| template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| template<class rtype,class vtype> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   spProj5m(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   for(int i=0;i<N;i++) { | ||||
|     spProj5m(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
| @@ -924,34 +1021,40 @@ template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> acce | ||||
|  | ||||
| template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   spRecon5m(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     spRecon5m(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spRecon5m(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|   }} | ||||
|     }} | ||||
| } | ||||
|  | ||||
| template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   accumRecon5m(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     accumRecon5m(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]); | ||||
| @@ -960,18 +1063,24 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iM | ||||
|  | ||||
|  | ||||
| // four spinor projectors for chiral proj | ||||
| template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin) | ||||
| //  template<class vtype> accelerator_inline void fspProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin) | ||||
| template<class vtype> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp; | ||||
|   spProj5m(hspin._internal,fspin._internal); | ||||
| } | ||||
| template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| //  template<class vtype,int N> accelerator_inline void fspProj5m (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin) | ||||
| template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++) { | ||||
|     spProj5m(hspin._internal[i],fspin._internal[i]); | ||||
|   } | ||||
| } | ||||
| template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| //  template<class vtype,int N> accelerator_inline void fspProj5m (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin) | ||||
| template<class vtype,int N> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin) | ||||
| { | ||||
|   //typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp; | ||||
|   for(int i=0;i<N;i++){  | ||||
|     for(int j=0;j<N;j++){ | ||||
|       spProj5m(hspin._internal[i][j],fspin._internal[i][j]); | ||||
|   | ||||
| @@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co | ||||
|   accelerator_for(sss,nloop,vobj::Nsimd(),{ | ||||
|     uint64_t ss = sss*Ls; | ||||
|     decltype(coalescedRead(y_v[ss+sp])) tmp; | ||||
|     spProj5m(tmp,y_v(ss+sp));  | ||||
|    tmp = a*x_v(ss+s)+b*tmp; | ||||
|     spProj5m(tmp,y_v(ss+sp)); | ||||
|     tmp = a*x_v(ss+s)+b*tmp; | ||||
|     coalescedWrite(z_v[ss+s],tmp); | ||||
|   }); | ||||
| } | ||||
| @@ -188,6 +188,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x) | ||||
|   z.Checkerboard() = x.Checkerboard(); | ||||
|   conformable(x,z); | ||||
|   int Ls = grid->_rdimensions[0]; | ||||
|   Gamma G5(Gamma::Algebra::Gamma5); | ||||
|   autoView( x_v, x, AcceleratorRead); | ||||
|   autoView( z_v, z, AcceleratorWrite); | ||||
|   uint64_t nloop = grid->oSites()/Ls; | ||||
| @@ -195,13 +196,7 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x) | ||||
|     uint64_t ss = sss*Ls; | ||||
|     for(int s=0;s<Ls;s++){ | ||||
|       int sp = Ls-1-s; | ||||
|       auto tmp = x_v(ss+s); | ||||
|       decltype(tmp) tmp_p; | ||||
|       decltype(tmp) tmp_m; | ||||
|       spProj5p(tmp_p,tmp); | ||||
|       spProj5m(tmp_m,tmp); | ||||
|       // Use of spProj5m, 5p captures the coarse space too | ||||
|       coalescedWrite(z_v[ss+sp],tmp_p - tmp_m); | ||||
|       coalescedWrite(z_v[ss+sp],G5*x_v(ss+s)); | ||||
|     } | ||||
|   }); | ||||
| } | ||||
| @@ -213,20 +208,10 @@ void G5C(Lattice<vobj> &z, const Lattice<vobj> &x) | ||||
|   z.Checkerboard() = x.Checkerboard(); | ||||
|   conformable(x, z); | ||||
|  | ||||
|   autoView( x_v, x, AcceleratorRead); | ||||
|   autoView( z_v, z, AcceleratorWrite); | ||||
|   uint64_t nloop = grid->oSites(); | ||||
|   accelerator_for(ss,nloop,vobj::Nsimd(),{ | ||||
|     auto tmp = x_v(ss); | ||||
|     decltype(tmp) tmp_p; | ||||
|     decltype(tmp) tmp_m; | ||||
|     spProj5p(tmp_p,tmp); | ||||
|     spProj5m(tmp_m,tmp); | ||||
|     coalescedWrite(z_v[ss],tmp_p - tmp_m); | ||||
|   }); | ||||
|   Gamma G5(Gamma::Algebra::Gamma5); | ||||
|   z = G5 * x; | ||||
| } | ||||
|  | ||||
| /* | ||||
| template<class CComplex, int nbasis> | ||||
| void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, nbasis>> &x) | ||||
| { | ||||
| @@ -249,7 +234,6 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex, | ||||
|     } | ||||
|   }); | ||||
| } | ||||
| */ | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
|   | ||||
| @@ -41,6 +41,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| namespace Grid { | ||||
|  | ||||
| #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) | ||||
| typedef struct { uint16_t x;} half; | ||||
| #endif | ||||
| typedef struct Half2_t { half x; half y; } Half2; | ||||
|  | ||||
| #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) | ||||
|  | ||||
| template<class pair> | ||||
| @@ -125,14 +130,14 @@ inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const | ||||
| } | ||||
|  | ||||
| constexpr int NSIMD_RealH    = COALESCE_GRANULARITY / sizeof(half); | ||||
| constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2); | ||||
| constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(Half2); | ||||
| constexpr int NSIMD_RealF    = COALESCE_GRANULARITY / sizeof(float); | ||||
| constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2); | ||||
| constexpr int NSIMD_RealD    = COALESCE_GRANULARITY / sizeof(double); | ||||
| constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2); | ||||
| constexpr int NSIMD_Integer  = COALESCE_GRANULARITY / sizeof(Integer); | ||||
|  | ||||
| typedef GpuComplex<half2  > GpuComplexH; | ||||
| typedef GpuComplex<Half2  > GpuComplexH; | ||||
| typedef GpuComplex<float2 > GpuComplexF; | ||||
| typedef GpuComplex<double2> GpuComplexD; | ||||
|  | ||||
| @@ -147,11 +152,9 @@ typedef GpuVector<NSIMD_Integer,  Integer     > GpuVectorI; | ||||
| accelerator_inline float half2float(half h) | ||||
| { | ||||
|   float f; | ||||
| #ifdef GRID_SIMT | ||||
| #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|   f = __half2float(h); | ||||
| #else  | ||||
|   //f = __half2float(h); | ||||
|   __half_raw hr(h); | ||||
|   Grid_half hh;  | ||||
|   hh.x = hr.x; | ||||
|   f=  sfw_half_to_float(hh); | ||||
| @@ -161,13 +164,11 @@ accelerator_inline float half2float(half h) | ||||
| accelerator_inline half float2half(float f) | ||||
| { | ||||
|   half h; | ||||
| #ifdef GRID_SIMT | ||||
| #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|   h = __float2half(f); | ||||
| #else | ||||
|   Grid_half hh = sfw_float_to_half(f); | ||||
|   __half_raw hr;   | ||||
|   hr.x = hh.x; | ||||
|   h = __half(hr); | ||||
|   h.x = hh.x; | ||||
| #endif | ||||
|   return h; | ||||
| } | ||||
| @@ -523,7 +524,7 @@ namespace Optimization { | ||||
|     //////////////////////////////////////////////////////////////////////////////////// | ||||
|     // Single / Half | ||||
|     //////////////////////////////////////////////////////////////////////////////////// | ||||
|     static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { | ||||
|      static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { | ||||
|       int N = GpuVectorCF::N; | ||||
|       GpuVectorCH h; | ||||
|       for(int i=0;i<N;i++) { | ||||
|   | ||||
| @@ -55,6 +55,7 @@ void acceleratorInit(void) | ||||
| 	printf("AcceleratorCudaInit[%d]: ========================\n",rank); | ||||
| 	printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name); | ||||
|  | ||||
|  | ||||
| 	GPU_PROP_FMT(totalGlobalMem,"%lld"); | ||||
| 	GPU_PROP(managedMemory); | ||||
| 	GPU_PROP(isMultiGpuBoard); | ||||
| @@ -109,20 +110,24 @@ void acceleratorInit(void) | ||||
|   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);} | ||||
|   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} | ||||
|  | ||||
|   printf("world_rank %d has %d devices\n",world_rank,nDevices); | ||||
|   size_t totalDeviceMem=0; | ||||
|   for (int i = 0; i < nDevices; i++) { | ||||
|  | ||||
| #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); | ||||
| #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d"); | ||||
|      | ||||
|     hipGetDeviceProperties(&gpu_props[i], i); | ||||
|     hipDeviceProp_t prop;  | ||||
|     prop = gpu_props[i]; | ||||
|     totalDeviceMem = prop.totalGlobalMem; | ||||
|     if ( world_rank == 0) { | ||||
|       hipDeviceProp_t prop;  | ||||
|       prop = gpu_props[i]; | ||||
|       printf("AcceleratorHipInit: ========================\n"); | ||||
|       printf("AcceleratorHipInit: Device Number    : %d\n", i); | ||||
|       printf("AcceleratorHipInit: ========================\n"); | ||||
|       printf("AcceleratorHipInit: Device identifier: %s\n", prop.name); | ||||
|  | ||||
|       GPU_PROP_FMT(totalGlobalMem,"%lu"); | ||||
|       //      GPU_PROP(managedMemory); | ||||
|       GPU_PROP(isMultiGpuBoard); | ||||
|       GPU_PROP(warpSize); | ||||
| @@ -131,6 +136,7 @@ void acceleratorInit(void) | ||||
|       //      GPU_PROP(singleToDoublePrecisionPerfRatio); | ||||
|     } | ||||
|   } | ||||
|   MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours | ||||
| #undef GPU_PROP_FMT     | ||||
| #undef GPU_PROP | ||||
| #ifdef GRID_IBM_SUMMIT | ||||
|   | ||||
| @@ -151,9 +151,6 @@ inline void *acceleratorAllocShared(size_t bytes) | ||||
|     ptr = (void *) NULL; | ||||
|     printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err)); | ||||
|   } | ||||
|   //  size_t free,total; | ||||
|   //  cudaMemGetInfo(&free,&total); | ||||
|   //  std::cout << "Malloc managed "<<bytes<<" "<<free<<"/"<<total<<std::endl; | ||||
|   return ptr; | ||||
| }; | ||||
| inline void *acceleratorAllocDevice(size_t bytes) | ||||
| @@ -164,9 +161,6 @@ inline void *acceleratorAllocDevice(size_t bytes) | ||||
|     ptr = (void *) NULL; | ||||
|     printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); | ||||
|   } | ||||
|   //  size_t free,total; | ||||
|   //  cudaMemGetInfo(&free,&total); | ||||
|   //  std::cout << "Malloc device "<<bytes<<" "<<free<<"/"<<total<<std::endl; | ||||
|   return ptr; | ||||
| }; | ||||
| inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; | ||||
| @@ -313,17 +307,13 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) | ||||
|  | ||||
| inline void *acceleratorAllocShared(size_t bytes) | ||||
| { | ||||
| #if 0 | ||||
|   void *ptr=NULL; | ||||
|   auto err = hipMallocManaged((void **)&ptr,bytes); | ||||
|   if( err != hipSuccess ) { | ||||
|     ptr = (void *) NULL; | ||||
|     printf(" hipMallocManaged failed for %d %s \n",bytes,hipGetErrorString(err)); | ||||
|     printf(" hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); | ||||
|   } | ||||
|   return ptr; | ||||
| #else | ||||
|   return malloc(bytes); | ||||
| #endif | ||||
| }; | ||||
| inline int  acceleratorIsCommunicable(void *ptr){ return 1; } | ||||
|  | ||||
| @@ -333,7 +323,7 @@ inline void *acceleratorAllocDevice(size_t bytes) | ||||
|   auto err = hipMalloc((void **)&ptr,bytes); | ||||
|   if( err != hipSuccess ) { | ||||
|     ptr = (void *) NULL; | ||||
|     printf(" hipMalloc failed for %d %s \n",bytes,hipGetErrorString(err)); | ||||
|     printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); | ||||
|   } | ||||
|   return ptr; | ||||
| }; | ||||
|   | ||||
							
								
								
									
										33
									
								
								README
									
									
									
									
									
								
							
							
						
						
									
										33
									
								
								README
									
									
									
									
									
								
							| @@ -111,11 +111,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu | ||||
|  | ||||
| ``` bash | ||||
| mkdir build; cd build | ||||
| ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path> | ||||
| ../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path> | ||||
| ``` | ||||
|  | ||||
| where `--enable-precision=` set the default precision, | ||||
| `--enable-simd=` set the SIMD type, `--enable- | ||||
| where `--enable-simd=` sets the SIMD type, `--enable-comms=` selects the | ||||
| message-passing implementation, and `<path>` should be replaced by the prefix path where you want to | ||||
| install Grid. Other options are detailed in the next section; you can also use | ||||
| `configure --help` to display them. Like with any other program using GNU autotools, the | ||||
| @@ -146,8 +145,8 @@ If you want to build all the tests at once just use `make tests`. | ||||
| - `--enable-numa`: enable NUMA first touch optimisation | ||||
| - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. | ||||
| - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). | ||||
| - `--enable-precision={single|double}`: set the default precision (default: `double`). | ||||
| - `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. | ||||
| - `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** | ||||
| - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below. | ||||
| - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo`). | ||||
| - `--disable-timers`: disable system dependent high-resolution timers. | ||||
| - `--enable-chroma`: enable Chroma regression tests. | ||||
| @@ -201,8 +200,7 @@ Alternatively, some CPU codenames can be directly used: | ||||
| The following configuration is recommended for the Intel Knights Landing platform: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
| ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi-auto  \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=icpc MPICXX=mpiicpc | ||||
| @@ -212,8 +210,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
| ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi       \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -232,8 +229,7 @@ for interior communication. This is the mpi3 communications implementation. | ||||
| We recommend four ranks per node for best performance, but optimum is local volume dependent. | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
| ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi3-auto \ | ||||
|              --enable-mkl             \ | ||||
|              CC=icpc MPICXX=mpiicpc  | ||||
| @@ -244,8 +240,7 @@ We recommend four ranks per node for best performance, but optimum is local volu | ||||
| The following configuration is recommended for the Intel Haswell platform: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
| ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi3-auto \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=icpc MPICXX=mpiicpc | ||||
| @@ -262,8 +257,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
| ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi3      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -280,8 +274,7 @@ This is the default. | ||||
| The following configuration is recommended for the Intel Skylake platform: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX512     \ | ||||
| ../configure --enable-simd=AVX512     \ | ||||
|              --enable-comms=mpi3      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=mpiicpc | ||||
| @@ -298,8 +291,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX512     \ | ||||
| ../configure --enable-simd=AVX512     \ | ||||
|              --enable-comms=mpi3      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -330,8 +322,7 @@ and 8 threads per rank. | ||||
| The following configuration is recommended for the AMD EPYC platform. | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
| ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi3 \ | ||||
|              CXX=mpicxx  | ||||
| ``` | ||||
|   | ||||
							
								
								
									
										33
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										33
									
								
								README.md
									
									
									
									
									
								
							| @@ -115,11 +115,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu | ||||
|  | ||||
| ``` bash | ||||
| mkdir build; cd build | ||||
| ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path> | ||||
| ../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path> | ||||
| ``` | ||||
|  | ||||
| where `--enable-precision=` set the default precision, | ||||
| `--enable-simd=` set the SIMD type, `--enable- | ||||
| where `--enable-simd=` sets the SIMD type, `--enable- | ||||
| comms=` selects the message-passing interface, and `<path>` should be replaced by the prefix path where you want to | ||||
| install Grid. Other options are detailed in the next section, you can also use `configure | ||||
| --help` to display them. As with any other program using GNU autotools, the | ||||
| @@ -150,8 +149,8 @@ If you want to build all the tests at once just use `make tests`. | ||||
| - `--enable-numa`: enable NUMA first touch optimisation | ||||
| - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. | ||||
| - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). | ||||
| - `--enable-precision={single|double}`: set the default precision (default: `double`). | ||||
| - `--enable-precision=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. | ||||
| - `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** | ||||
| - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below. | ||||
| - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo`). | ||||
| - `--disable-timers`: disable system dependent high-resolution timers. | ||||
| - `--enable-chroma`: enable Chroma regression tests. | ||||
| @@ -205,8 +204,7 @@ Alternatively, some CPU codenames can be directly used: | ||||
| The following configuration is recommended for the Intel Knights Landing platform: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
| ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi-auto  \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=icpc MPICXX=mpiicpc | ||||
| @@ -216,8 +214,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
| ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi       \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -236,8 +233,7 @@ for interior communication. This is the mpi3 communications implementation. | ||||
| We recommend four ranks per node for best performance, but optimum is local volume dependent. | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
| ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi3-auto \ | ||||
|              --enable-mkl             \ | ||||
|              CC=icpc MPICXX=mpiicpc  | ||||
| @@ -248,8 +244,7 @@ We recommend four ranks per node for best performance, but optimum is local volu | ||||
| The following configuration is recommended for the Intel Haswell platform: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
| ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi3-auto \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=icpc MPICXX=mpiicpc | ||||
| @@ -266,8 +261,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
| ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi3      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -284,8 +278,7 @@ This is the default. | ||||
| The following configuration is recommended for the Intel Skylake platform: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX512     \ | ||||
| ../configure --enable-simd=AVX512     \ | ||||
|              --enable-comms=mpi3      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=mpiicpc | ||||
| @@ -302,8 +295,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX512     \ | ||||
| ../configure --enable-simd=AVX512     \ | ||||
|              --enable-comms=mpi3      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -334,8 +326,7 @@ and 8 threads per rank. | ||||
| The following configuration is recommended for the AMD EPYC platform. | ||||
|  | ||||
| ``` bash | ||||
| ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
| ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi3 \ | ||||
|              CXX=mpicxx  | ||||
| ``` | ||||
|   | ||||
| @@ -12,31 +12,31 @@ module load mpi/openmpi-aarch64 | ||||
|  | ||||
| scl enable gcc-toolset-10 bash | ||||
|  | ||||
| ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" | ||||
| ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" | ||||
|  | ||||
| * gcc 10.1 prebuild w/ MPI, QPACE4 interactive login | ||||
|  | ||||
| scl enable gcc-toolset-10 bash | ||||
| module load mpi/openmpi-aarch64 | ||||
|  | ||||
| ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" | ||||
| ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" | ||||
|  | ||||
| ------------------------------------------------------------------------------ | ||||
|  | ||||
| * armclang 20.2 (qp4) | ||||
|  | ||||
| ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" | ||||
| ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" | ||||
|  | ||||
| ------------------------------------------------------------------------------ | ||||
|  | ||||
| * gcc 10.0.1 VLA (merlin) | ||||
|  | ||||
| ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static | ||||
| ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static | ||||
|  | ||||
|  | ||||
| * gcc 10.0.1 fixed-size ACLE (merlin) | ||||
|  | ||||
| ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" | ||||
| ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" | ||||
|  | ||||
|  | ||||
| * gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI | ||||
| @@ -46,34 +46,34 @@ export OMPI_CXX=g++-10.0.1 | ||||
| export MPICH_CC=gcc-10.0.1 | ||||
| export MPICH_CXX=g++-10.0.1 | ||||
|  | ||||
| $ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" | ||||
| $ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" | ||||
|  | ||||
| -------------------------------------------------------- | ||||
|  | ||||
| * armclang 20.0 VLA (merlin) | ||||
|  | ||||
| ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static | ||||
| ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static | ||||
|  | ||||
| TODO check ARMCLANGCOMPAT | ||||
|  | ||||
|  | ||||
| * armclang 20.1 VLA (merlin) | ||||
|  | ||||
| ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static | ||||
| ../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static | ||||
|  | ||||
| TODO check ARMCLANGCOMPAT | ||||
|  | ||||
|  | ||||
| * armclang 20.1 VLA (fjt cluster) | ||||
|  | ||||
| ../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" | ||||
| ../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" | ||||
|  | ||||
| TODO check ARMCLANGCOMPAT | ||||
|  | ||||
|  | ||||
| * armclang 20.1 VLA w/MPI (fjt cluster) | ||||
|  | ||||
| ../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" | ||||
| ../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" | ||||
|  | ||||
| No ARMCLANGCOMPAT -> still correct ? | ||||
|  | ||||
| @@ -81,9 +81,9 @@ No ARMCLANGCOMPAT -> still correct ? | ||||
|  | ||||
| * Fujitsu fcc | ||||
|  | ||||
| ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" | ||||
| ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" | ||||
|  | ||||
|  | ||||
| * Fujitsu fcc w/ MPI | ||||
|  | ||||
| ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" | ||||
| ../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" | ||||
|   | ||||
| @@ -1,8 +1,16 @@ | ||||
|  | ||||
| #include "Benchmark_IO.hpp" | ||||
|  | ||||
| #ifndef BENCH_IO_LMIN | ||||
| #define BENCH_IO_LMIN 8 | ||||
| #endif | ||||
|  | ||||
| #ifndef BENCH_IO_LMAX | ||||
| #define BENCH_IO_LMAX 40 | ||||
| #define BENCH_IO_LMAX 32 | ||||
| #endif | ||||
|  | ||||
| #ifndef BENCH_IO_NPASS | ||||
| #define BENCH_IO_NPASS 10 | ||||
| #endif | ||||
|  | ||||
| using namespace Grid; | ||||
| @@ -12,37 +20,179 @@ std::string filestem(const int l) | ||||
|   return "iobench_l" + std::to_string(l); | ||||
| } | ||||
|  | ||||
| // Local lattice extent associated with volume index i: extents start at | ||||
| // BENCH_IO_LMIN and grow in steps of 2. | ||||
| int vol(const int i) | ||||
| { | ||||
|   const int offset = 2*i; | ||||
|  | ||||
|   return BENCH_IO_LMIN + offset; | ||||
| } | ||||
|  | ||||
| // Volume index of local extent l, counted from BENCH_IO_LMIN in steps of 2 | ||||
| // (integer division). | ||||
| int volInd(const int l) | ||||
| { | ||||
|   const int shift = l - BENCH_IO_LMIN; | ||||
|  | ||||
|   return shift/2; | ||||
| } | ||||
|  | ||||
| // Element-wise sample mean and standard deviation over a set of equally-sized | ||||
| // Eigen matrices/vectors. | ||||
| //   mean   (out): element-wise average of the entries of `data` | ||||
| //   stdDev (out): element-wise sample standard deviation (n - 1 normalisation) | ||||
| //   data   (in) : the samples; at least two are required | ||||
| template <typename Mat> | ||||
| void stats(Mat &mean, Mat &stdDev, const std::vector<Mat> &data) | ||||
| { | ||||
|   // validate the sample count before data[0] is dereferenced below | ||||
|   assert(data.size() > 1); | ||||
|  | ||||
|   auto            nr = data[0].rows(), nc = data[0].cols(); | ||||
|   Eigen::MatrixXd sqSum(nr, nc); | ||||
|   double          n = static_cast<double>(data.size()); | ||||
|  | ||||
|   mean  = Mat::Zero(nr, nc); | ||||
|   sqSum = Mat::Zero(nr, nc); | ||||
|   for (auto &d: data) | ||||
|   { | ||||
|     mean  += d; | ||||
|     sqSum += d.cwiseProduct(d); | ||||
|   } | ||||
|   // `mean` still holds the plain sum here, so this is (sum x^2 - (sum x)^2/n)/(n - 1). | ||||
|   // Rounding can make that slightly negative for near-identical samples; clamp at 0 | ||||
|   // so the square root cannot produce NaN. | ||||
|   stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseMax(0.).cwiseSqrt(); | ||||
|   mean  /= n; | ||||
| } | ||||
|  | ||||
| // printf-style formatting sent to the MSG stream. | ||||
| // The text is formatted into a fixed 1024-byte stack buffer; snprintf truncates | ||||
| // longer output instead of writing past the end of the buffer. | ||||
| #define grid_printf(...) \ | ||||
| {\ | ||||
|   char _buf[1024];\ | ||||
|   snprintf(_buf, sizeof(_buf), __VA_ARGS__);\ | ||||
|   MSG << _buf;\ | ||||
| } | ||||
|  | ||||
| // Column indices of the per-pass performance matrices: std read/write and Grid (Lime) read/write. | ||||
| enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3}; | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   int64_t threads = GridThread::GetThreads(); | ||||
|   int64_t                      threads = GridThread::GetThreads(); | ||||
|   auto                         mpi     = GridDefaultMpi(); | ||||
|   unsigned int                 nVol    = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1; | ||||
|   unsigned int                 nRelVol = (BENCH_IO_LMAX - 24)/2 + 1; | ||||
|   std::vector<Eigen::MatrixXd> perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4)); | ||||
|   std::vector<Eigen::VectorXd> avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4)); | ||||
|   std::vector<int>             latt; | ||||
|  | ||||
|   MSG << "Grid is setup to use " << threads << " threads" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   MSG << "Benchmark Lime write" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   for (int l = 4; l <= BENCH_IO_LMAX; l += 2) | ||||
|   MSG << "MPI partition " << mpi << std::endl; | ||||
|   for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i) | ||||
|   { | ||||
|     auto             mpi  = GridDefaultMpi(); | ||||
|     std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; | ||||
|     MSG << BIGSEP << std::endl; | ||||
|     MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl; | ||||
|     MSG << BIGSEP << std::endl; | ||||
|     MSG << SEP << std::endl; | ||||
|     MSG << "Benchmark std write" << std::endl; | ||||
|     MSG << SEP << std::endl; | ||||
|     for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) | ||||
|     { | ||||
|       latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; | ||||
|  | ||||
|     std::cout << "-- Local volume " << l << "^4" << std::endl; | ||||
|     writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>); | ||||
|       MSG << "-- Local volume " << l << "^4" << std::endl; | ||||
|       writeBenchmark<LatticeFermion>(latt, filestem(l), stdWrite<LatticeFermion>); | ||||
|       perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond; | ||||
|     } | ||||
|  | ||||
|     MSG << SEP << std::endl; | ||||
|     MSG << "Benchmark std read" << std::endl; | ||||
|     MSG << SEP << std::endl; | ||||
|     for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) | ||||
|     { | ||||
|       latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; | ||||
|  | ||||
|       MSG << "-- Local volume " << l << "^4" << std::endl; | ||||
|       readBenchmark<LatticeFermion>(latt, filestem(l), stdRead<LatticeFermion>); | ||||
|       perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond; | ||||
|     } | ||||
|  | ||||
|   #ifdef HAVE_LIME | ||||
|     MSG << SEP << std::endl; | ||||
|     MSG << "Benchmark Grid C-Lime write" << std::endl; | ||||
|     MSG << SEP << std::endl; | ||||
|     for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) | ||||
|     { | ||||
|       latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; | ||||
|  | ||||
|       MSG << "-- Local volume " << l << "^4" << std::endl; | ||||
|       writeBenchmark<LatticeFermion>(latt, filestem(l), limeWrite<LatticeFermion>); | ||||
|       perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond; | ||||
|     } | ||||
|  | ||||
|     MSG << SEP << std::endl; | ||||
|     MSG << "Benchmark Grid C-Lime read" << std::endl; | ||||
|     MSG << SEP << std::endl; | ||||
|     for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) | ||||
|     { | ||||
|       latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; | ||||
|  | ||||
|       MSG << "-- Local volume " << l << "^4" << std::endl; | ||||
|       readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>); | ||||
|       perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond; | ||||
|     } | ||||
| #endif | ||||
|     avPerf[i].fill(0.); | ||||
|     for (int f = 0; f < 4; ++f) | ||||
|     for (int l = 24; l <= BENCH_IO_LMAX; l += 2) | ||||
|     { | ||||
|       avPerf[i](f) += perf[i](volInd(l), f); | ||||
|     } | ||||
|     avPerf[i] /= nRelVol; | ||||
|   } | ||||
|  | ||||
|   MSG << "Benchmark Lime read" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   for (int l = 4; l <= BENCH_IO_LMAX; l += 2) | ||||
|   { | ||||
|     auto             mpi  = GridDefaultMpi(); | ||||
|     std::vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; | ||||
|   Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4); | ||||
|   Eigen::VectorXd avMean(4), avStdDev(4), avRob(4); | ||||
|   double          n = BENCH_IO_NPASS; | ||||
|  | ||||
|     std::cout << "-- Local volume " << l << "^4" << std::endl; | ||||
|     readBenchmark<LatticeFermion>(latt, filestem(l), limeRead<LatticeFermion>); | ||||
|   stats(mean, stdDev, perf); | ||||
|   stats(avMean, avStdDev, avPerf); | ||||
|   rob.fill(100.); | ||||
|   rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs()); | ||||
|   avRob.fill(100.); | ||||
|   avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs()); | ||||
|  | ||||
|   MSG << BIGSEP << std::endl; | ||||
|   MSG << "SUMMARY" << std::endl; | ||||
|   MSG << BIGSEP << std::endl; | ||||
|   MSG << "Summary of individual results (all results in MB/s)." << std::endl; | ||||
|   MSG << "Every second colum gives the standard deviation of the previous column." << std::endl; | ||||
|   MSG << std::endl; | ||||
|   grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", | ||||
|               "L", "std read", "std dev", "std write", "std dev", | ||||
|               "Grid read", "std dev", "Grid write", "std dev"); | ||||
|   for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) | ||||
|   { | ||||
|     grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", | ||||
|                 l, mean(volInd(l), sRead), stdDev(volInd(l), sRead), | ||||
|                 mean(volInd(l), sWrite), stdDev(volInd(l), sWrite), | ||||
|                 mean(volInd(l), gRead), stdDev(volInd(l), gRead), | ||||
|                 mean(volInd(l), gWrite), stdDev(volInd(l), gWrite)); | ||||
|   } | ||||
|   MSG << std::endl; | ||||
|   MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl; | ||||
|   MSG << std::endl; | ||||
|   grid_printf("%4s %12s %12s %12s %12s\n", | ||||
|               "L", "std read", "std write", "Grid read", "Grid write"); | ||||
|   for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) | ||||
|   { | ||||
|     grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", | ||||
|                 l, rob(volInd(l), sRead), rob(volInd(l), sWrite), | ||||
|                 rob(volInd(l), gRead), rob(volInd(l), gWrite)); | ||||
|   } | ||||
|   MSG << std::endl; | ||||
|   MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl; | ||||
|   MSG << "Every second colum gives the standard deviation of the previous column." << std::endl; | ||||
|   MSG << std::endl; | ||||
|   grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", | ||||
|               "std read", "std dev", "std write", "std dev", | ||||
|               "Grid read", "std dev", "Grid write", "std dev"); | ||||
|   grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", | ||||
|               avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), | ||||
|               avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite)); | ||||
|   MSG << std::endl; | ||||
|   MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl; | ||||
|   MSG << std::endl; | ||||
|   grid_printf("%12s %12s %12s %12s\n", | ||||
|               "std read", "std write", "Grid read", "Grid write"); | ||||
|   grid_printf("%12.1f %12.1f %12.1f %12.1f\n", | ||||
|               avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite)); | ||||
|  | ||||
|   Grid_finalize(); | ||||
| #endif | ||||
|  | ||||
|   return EXIT_SUCCESS; | ||||
| } | ||||
|   | ||||
| @@ -5,6 +5,8 @@ | ||||
| #ifdef HAVE_LIME | ||||
| #define MSG std::cout << GridLogMessage | ||||
| #define SEP \ | ||||
| "-----------------------------------------------------------------------------" | ||||
| #define BIGSEP \ | ||||
| "=============================================================================" | ||||
|  | ||||
| namespace Grid { | ||||
| @@ -14,13 +16,152 @@ using WriterFn = std::function<void(const std::string, Field &)> ; | ||||
| template <typename Field> | ||||
| using ReaderFn = std::function<void(Field &, const std::string)>; | ||||
|  | ||||
| // AP 06/10/2020: Standard C version in case one is suspicious of the C++ API | ||||
| //  | ||||
| // template <typename Field> | ||||
| // void stdWrite(const std::string filestem, Field &vec) | ||||
| // { | ||||
| //   std::string   rankStr = std::to_string(vec.Grid()->ThisRank()); | ||||
| //   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb"); | ||||
| //   size_t        size; | ||||
| //   uint32_t      crc; | ||||
| //   GridStopWatch ioWatch, crcWatch; | ||||
|  | ||||
| //   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); | ||||
| //   autoView(vec_v, vec, CpuRead); | ||||
| //   crcWatch.Start(); | ||||
| //   crc = GridChecksum::crc32(vec_v.cpu_ptr, size); | ||||
| //   std::fwrite(&crc, sizeof(uint32_t), 1, file); | ||||
| //   crcWatch.Stop(); | ||||
| //   MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; | ||||
| //   ioWatch.Start(); | ||||
| //   std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); | ||||
| //   ioWatch.Stop(); | ||||
| //   std::fclose(file); | ||||
| //   size *= vec.Grid()->ProcessorCount(); | ||||
| //   auto &p = BinaryIO::lastPerf; | ||||
| //   p.size            = size; | ||||
| //   p.time            = ioWatch.useconds(); | ||||
| //   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); | ||||
| //   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()  | ||||
| //       << ", " << p.mbytesPerSecond << " MB/s" << std::endl; | ||||
| //   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; | ||||
| // } | ||||
| // | ||||
| // template <typename Field> | ||||
| // void stdRead(Field &vec, const std::string filestem) | ||||
| // { | ||||
| //   std::string   rankStr = std::to_string(vec.Grid()->ThisRank()); | ||||
| //   std::FILE     *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "rb"); | ||||
| //   size_t        size; | ||||
| //   uint32_t      crcRead, crcData; | ||||
| //   GridStopWatch ioWatch, crcWatch; | ||||
|  | ||||
| //   size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); | ||||
| //   crcWatch.Start(); | ||||
| //   std::fread(&crcRead, sizeof(uint32_t), 1, file); | ||||
| //   crcWatch.Stop(); | ||||
| //   { | ||||
| //     autoView(vec_v, vec, CpuWrite); | ||||
| //     ioWatch.Start(); | ||||
| //     std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); | ||||
| //     ioWatch.Stop(); | ||||
| //     std::fclose(file); | ||||
| //   } | ||||
| //   { | ||||
| //     autoView(vec_v, vec, CpuRead); | ||||
| //     crcWatch.Start(); | ||||
| //     crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); | ||||
| //     crcWatch.Stop(); | ||||
| //   } | ||||
| //   MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; | ||||
| //   assert(crcData == crcRead); | ||||
| //   size *= vec.Grid()->ProcessorCount(); | ||||
| //   auto &p = BinaryIO::lastPerf; | ||||
| //   p.size            = size; | ||||
| //   p.time            = ioWatch.useconds(); | ||||
| //   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); | ||||
| //   MSG << "Std I/O read: Read " <<  p.size << " bytes in " << ioWatch.Elapsed()  | ||||
| //       << ", " << p.mbytesPerSecond << " MB/s" << std::endl; | ||||
| //   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; | ||||
| // } | ||||
|  | ||||
| // Write the local data of `vec` to the per-rank file "<filestem>.<rank>.bin" with | ||||
| // std::ofstream and record the throughput in BinaryIO::lastPerf. | ||||
| //   filestem : file name stem; ".<rank>.bin" is appended | ||||
| //   vec      : field whose CPU view is written as raw bytes | ||||
| // File layout: a 32-bit CRC32 of the data followed by the data itself. | ||||
| // The reported size is the local byte count multiplied by ProcessorCount(), while | ||||
| // the time is the one measured locally for the data write (flush included). | ||||
| template <typename Field> | ||||
| void stdWrite(const std::string filestem, Field &vec) | ||||
| { | ||||
|   std::string   rankStr = std::to_string(vec.Grid()->ThisRank()); | ||||
|   std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary); | ||||
|   size_t        size, sizec; | ||||
|   uint32_t      crc; | ||||
|   GridStopWatch ioWatch, crcWatch; | ||||
|  | ||||
|   // an unopened stream would silently drop every write and yield a bogus bandwidth | ||||
|   assert(file.is_open()); | ||||
|   size  = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); | ||||
|   sizec = size/sizeof(char); // byte count expressed in units of char | ||||
|   autoView(vec_v, vec, CpuRead); | ||||
|   // checksum computation and storage are timed separately from the data write | ||||
|   crcWatch.Start(); | ||||
|   crc = GridChecksum::crc32(vec_v.cpu_ptr, size); | ||||
|   file.write(reinterpret_cast<char *>(&crc), sizeof(uint32_t)/sizeof(char)); | ||||
|   crcWatch.Stop(); | ||||
|   MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; | ||||
|   ioWatch.Start(); | ||||
|   file.write(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec); | ||||
|   file.flush(); | ||||
|   ioWatch.Stop(); | ||||
|   // a failed write or flush must not be reported as a valid measurement | ||||
|   assert(file.good()); | ||||
|   size *= vec.Grid()->ProcessorCount(); | ||||
|   auto &p = BinaryIO::lastPerf; | ||||
|   p.size            = size; | ||||
|   p.time            = ioWatch.useconds(); | ||||
|   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); | ||||
|   MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed()  | ||||
|       << ", " << p.mbytesPerSecond << " MB/s" << std::endl; | ||||
|   MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; | ||||
| } | ||||
|  | ||||
| // Read the per-rank file "<filestem>.<rank>.bin" into `vec` with std::ifstream, | ||||
| // check the stored CRC32 against the data read and record the throughput in | ||||
| // BinaryIO::lastPerf. | ||||
| //   vec      : field whose CPU view is filled with the raw bytes from the file | ||||
| //   filestem : file name stem; ".<rank>.bin" is appended | ||||
| // Expected file layout: a 32-bit CRC32 followed by the data. | ||||
| // The reported size is the local byte count multiplied by ProcessorCount(), while | ||||
| // the time is the one measured locally for the data read only. | ||||
| template <typename Field> | ||||
| void stdRead(Field &vec, const std::string filestem) | ||||
| { | ||||
|   std::string   rankStr = std::to_string(vec.Grid()->ThisRank()); | ||||
|   std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary); | ||||
|   size_t        size, sizec; | ||||
|   uint32_t      crcRead, crcData; | ||||
|   GridStopWatch ioWatch, crcWatch; | ||||
|  | ||||
|   // without this check a missing file leaves crcRead uninitialised and only the | ||||
|   // CRC comparison below would fail, hiding the real cause | ||||
|   assert(file.is_open()); | ||||
|   size  = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); | ||||
|   sizec = size/sizeof(char); // byte count expressed in units of char | ||||
|   crcWatch.Start(); | ||||
|   file.read(reinterpret_cast<char *>(&crcRead), sizeof(uint32_t)/sizeof(char)); | ||||
|   crcWatch.Stop(); | ||||
|   { | ||||
|     autoView(vec_v, vec, CpuWrite); | ||||
|     ioWatch.Start(); | ||||
|     file.read(reinterpret_cast<char *>(vec_v.cpu_ptr), sizec); | ||||
|     ioWatch.Stop(); | ||||
|   } | ||||
|   // a short or failed read must not be reported as a valid measurement | ||||
|   assert(!file.fail()); | ||||
|   { | ||||
|     // the checksum is recomputed from a separate read-only view of the field | ||||
|     autoView(vec_v, vec, CpuRead); | ||||
|     crcWatch.Start(); | ||||
|     crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); | ||||
|     crcWatch.Stop(); | ||||
|   } | ||||
|   MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; | ||||
|   assert(crcData == crcRead); | ||||
|   size *= vec.Grid()->ProcessorCount(); | ||||
|   auto &p = BinaryIO::lastPerf; | ||||
|   p.size            = size; | ||||
|   p.time            = ioWatch.useconds(); | ||||
|   p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); | ||||
|   MSG << "Std I/O read: Read " <<  p.size << " bytes in " << ioWatch.Elapsed()  | ||||
|       << ", " << p.mbytesPerSecond << " MB/s" << std::endl; | ||||
|   MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; | ||||
| } | ||||
|  | ||||
| // Write `vec` as a SciDAC field record through a ScidacWriter, which is opened, | ||||
| // written to and closed within this call; an empty user record accompanies the field. | ||||
| template <typename Field> | ||||
| void limeWrite(const std::string filestem, Field &vec) | ||||
| { | ||||
|   emptyUserRecord   record; | ||||
|   ScidacWriter binWriter(vec.Grid()->IsBoss()); | ||||
|  | ||||
|   binWriter.open(filestem + ".bin"); | ||||
|   binWriter.open(filestem + ".lime.bin"); | ||||
|   binWriter.writeScidacFieldRecord(vec, record); | ||||
|   binWriter.close(); | ||||
| } | ||||
| @@ -31,7 +172,7 @@ void limeRead(Field &vec, const std::string filestem) | ||||
|   emptyUserRecord   record; | ||||
|   ScidacReader binReader; | ||||
|  | ||||
|   binReader.open(filestem + ".bin"); | ||||
|   binReader.open(filestem + ".lime.bin"); | ||||
|   binReader.readScidacFieldRecord(vec, record); | ||||
|   binReader.close(); | ||||
| } | ||||
| @@ -73,12 +214,18 @@ void writeBenchmark(const Coordinate &latt, const std::string filename, | ||||
|   auto                           simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); | ||||
|   std::shared_ptr<GridCartesian> gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); | ||||
|   std::shared_ptr<GridBase>      gPt; | ||||
|   std::random_device             rd; | ||||
|  | ||||
|   makeGrid(gPt, gBasePt, Ls, rb); | ||||
|  | ||||
|   GridBase                       *g = gPt.get(); | ||||
|   GridParallelRNG                rng(g); | ||||
|   Field                          vec(g); | ||||
|   GridBase         *g = gPt.get(); | ||||
|   GridParallelRNG  rng(g); | ||||
|   Field            vec(g); | ||||
|  | ||||
|   rng.SeedFixedIntegers({static_cast<int>(rd()), static_cast<int>(rd()), | ||||
|                          static_cast<int>(rd()), static_cast<int>(rd()), | ||||
|                          static_cast<int>(rd()), static_cast<int>(rd()), | ||||
|                          static_cast<int>(rd()), static_cast<int>(rd())}); | ||||
|  | ||||
|   random(rng, vec); | ||||
|   write(filename, vec); | ||||
| @@ -96,8 +243,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename, | ||||
|  | ||||
|   makeGrid(gPt, gBasePt, Ls, rb); | ||||
|  | ||||
|   GridBase                       *g = gPt.get(); | ||||
|   Field                          vec(g); | ||||
|   GridBase *g = gPt.get(); | ||||
|   Field    vec(g); | ||||
|  | ||||
|   read(vec, filename); | ||||
| } | ||||
|   | ||||
| @@ -1,14 +1,9 @@ | ||||
| #include "Benchmark_IO.hpp" | ||||
|  | ||||
| #define MSG std::cout << GridLogMessage | ||||
| #define SEP \ | ||||
| "=============================================================================" | ||||
|  | ||||
| using namespace Grid; | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
| #ifdef HAVE_LIME | ||||
|   std::vector<std::string> dir; | ||||
|   unsigned int             Ls; | ||||
|   bool                     rb; | ||||
| @@ -34,46 +29,71 @@ int main (int argc, char ** argv) | ||||
|   } | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|  | ||||
|   int64_t threads = GridThread::GetThreads(); | ||||
|   auto    mpi     = GridDefaultMpi(); | ||||
|  | ||||
|   MSG << "Grid is setup to use " << threads << " threads" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   MSG << "Benchmark double precision Lime write" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   for (auto &d: dir) | ||||
|   { | ||||
|     MSG << "-- Directory " << d << std::endl; | ||||
|     writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermion>, Ls, rb); | ||||
|   } | ||||
|   MSG << "MPI partition " << mpi << std::endl; | ||||
|  | ||||
|   MSG << SEP << std::endl; | ||||
|   MSG << "Benchmark double precision Lime read" << std::endl; | ||||
|   MSG << "Benchmark Grid std write" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   for (auto &d: dir) | ||||
|   { | ||||
|     MSG << "-- Directory " << d << std::endl; | ||||
|     readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermion>, Ls, rb); | ||||
|     writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",  | ||||
|                                    stdWrite<LatticeFermion>, Ls, rb); | ||||
|   } | ||||
|   MSG << SEP << std::endl; | ||||
|   MSG << "Benchmark Grid std read" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   for (auto &d: dir) | ||||
|   { | ||||
|     MSG << "-- Directory " << d << std::endl; | ||||
|     readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",  | ||||
|                                   stdRead<LatticeFermion>, Ls, rb); | ||||
|   } | ||||
|  | ||||
| #ifdef HAVE_LIME | ||||
|   MSG << SEP << std::endl; | ||||
|   MSG << "Benchmark single precision Lime write" << std::endl; | ||||
|   MSG << "Benchmark Grid C-Lime write" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   for (auto &d: dir) | ||||
|   { | ||||
|     MSG << "-- Directory " << d << std::endl; | ||||
|     writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb); | ||||
|     writeBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",  | ||||
|                                    limeWrite<LatticeFermion>, Ls, rb); | ||||
|   } | ||||
|   MSG << SEP << std::endl; | ||||
|   MSG << "Benchmark Grid C-Lime read" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   for (auto &d: dir) | ||||
|   { | ||||
|     MSG << "-- Directory " << d << std::endl; | ||||
|     readBenchmark<LatticeFermion>(GridDefaultLatt(), d + "/ioBench",  | ||||
|                                   limeRead<LatticeFermion>, Ls, rb); | ||||
|   } | ||||
| #endif | ||||
|  | ||||
|   MSG << SEP << std::endl; | ||||
|   MSG << "Benchmark single precision Lime read" << std::endl; | ||||
|   MSG << SEP << std::endl; | ||||
|   for (auto &d: dir) | ||||
|   { | ||||
|     MSG << "-- Directory " << d << std::endl; | ||||
|     readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb); | ||||
|   } | ||||
|   // MSG << SEP << std::endl; | ||||
|   // MSG << "Benchmark single precision Lime write" << std::endl; | ||||
|   // MSG << SEP << std::endl; | ||||
|   // for (auto &d: dir) | ||||
|   // { | ||||
|   //   MSG << "-- Directory " << d << std::endl; | ||||
|   //   writeBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeWrite<LatticeFermionF>, Ls, rb); | ||||
|   // } | ||||
|  | ||||
|   // MSG << SEP << std::endl; | ||||
|   // MSG << "Benchmark single precision Lime read" << std::endl; | ||||
|   // MSG << SEP << std::endl; | ||||
|   // for (auto &d: dir) | ||||
|   // { | ||||
|   //   MSG << "-- Directory " << d << std::endl; | ||||
|   //   readBenchmark<LatticeFermionF>(GridDefaultLatt(), d + "/ioBench", limeRead<LatticeFermionF>, Ls, rb); | ||||
|   // } | ||||
|  | ||||
|   Grid_finalize(); | ||||
| #endif | ||||
|  | ||||
|   return EXIT_SUCCESS; | ||||
| } | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
|     /************************************************************************************* | ||||
| /************************************************************************************* | ||||
|  | ||||
|     Grid physics library, www.github.com/paboyle/Grid  | ||||
|  | ||||
| @@ -125,7 +125,7 @@ public: | ||||
| 	      lat*mpi_layout[1], | ||||
| 	      lat*mpi_layout[2], | ||||
| 	      lat*mpi_layout[3]}); | ||||
| 	std::cout << GridLogMessage<< latt_size <<std::endl; | ||||
|  | ||||
| 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
| 	RealD Nrank = Grid._Nprocessors; | ||||
| 	RealD Nnode = Grid.NodeCount(); | ||||
| @@ -137,8 +137,8 @@ public: | ||||
| 	for(int d=0;d<8;d++){ | ||||
| 	  xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||
| 	  rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||
| 	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||
| 	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||
| 	  //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||
| 	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||
| 	} | ||||
|  | ||||
| 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||
| @@ -202,6 +202,8 @@ public: | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|  | ||||
|    | ||||
|   static void Memory(void) | ||||
|   { | ||||
|     const int Nvec=8; | ||||
| @@ -222,7 +224,7 @@ public: | ||||
|  | ||||
|  | ||||
|   uint64_t lmax=32; | ||||
| #define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat) | ||||
| #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat) | ||||
|  | ||||
|     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|     for(int lat=8;lat<=lmax;lat+=8){ | ||||
| @@ -247,11 +249,6 @@ public: | ||||
|       double start=usecond(); | ||||
|       for(int i=0;i<Nloop;i++){ | ||||
| 	z=a*x-y; | ||||
| 	autoView( x_v , x, CpuWrite); | ||||
| 	autoView( y_v , y, CpuWrite); | ||||
| 	autoView( z_v , z, CpuRead); | ||||
|         x_v[0]=z_v[0]; // force serial dependency to prevent optimise away | ||||
|         y_v[4]=z_v[4]; | ||||
|       } | ||||
|       double stop=usecond(); | ||||
|       double time = (stop-start)/Nloop*1000; | ||||
| @@ -266,6 +263,61 @@ public: | ||||
|   }; | ||||
|  | ||||
|  | ||||
|   // Benchmarks the lattice-wide matrix product z = x*y on fields of Nc4 x Nc4 | ||||
|   // vComplexF matrices. For each per-rank extent lat = 8, 16, 24, 32 (the global | ||||
|   // lattice is lat*mpi_layout[d] in each direction) it times Nloop products and | ||||
|   // logs bytes per product, bytes/time, flops/time, elapsed seconds and | ||||
|   // bytes/time per node. | ||||
|   static void SU4(void) | ||||
|   { | ||||
|     const int Nc4=4; | ||||
|     typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4; | ||||
|  | ||||
|     Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); | ||||
|     Coordinate mpi_layout  = GridDefaultMpi(); | ||||
|  | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl; | ||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|  | ||||
|     const uint64_t lmax=32; | ||||
|  | ||||
|     // NOTE(review): sRNG is seeded but never drawn from in this function; it is | ||||
|     // kept because the effect of constructing it is not visible here -- confirm. | ||||
|     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|     for(int lat=8;lat<=lmax;lat+=8){ | ||||
|  | ||||
|       Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||
|       // Widen before multiplying: with the extents held as int, the four-way | ||||
|       // product can exceed the int range on large MPI partitions. | ||||
|       int64_t vol= static_cast<int64_t>(latt_size[0])*latt_size[1]*latt_size[2]*latt_size[3]; | ||||
|  | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|  | ||||
|       const uint64_t NN =Grid.NodeCount(); | ||||
|  | ||||
|       LatticeSU4 z(&Grid); z=Zero(); | ||||
|       LatticeSU4 x(&Grid); x=Zero(); | ||||
|       LatticeSU4 y(&Grid); y=Zero(); | ||||
|  | ||||
|       // Iteration count shrinks with lat^4: 1000 iterations at lat == lmax. | ||||
|       const uint64_t Nloop=1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat; | ||||
|  | ||||
|       double start=usecond(); | ||||
|       for(uint64_t i=0;i<Nloop;i++){ | ||||
|         z=x*y; | ||||
|       } | ||||
|       double stop=usecond(); | ||||
|       // Per-iteration time scaled by 1000; presumably nanoseconds if usecond() | ||||
|       // returns microseconds, which makes bytes/time GB/s -- confirm. | ||||
|       double time = (stop-start)/Nloop*1000; | ||||
|  | ||||
|       // Per matrix element: one complex multiply (6) plus Nc4-1 multiply-adds (8 each). | ||||
|       double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add | ||||
|       // Three fields (x, y, z), Nc4*Nc4 complex entries each, two RealF per entry. | ||||
|       double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF); | ||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  | ||||
|                << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000. | ||||
|                << "\t\t"<< bytes/time/NN <<std::endl; | ||||
|  | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|  | ||||
|   static double DWF(int Ls,int L) | ||||
|   { | ||||
|     RealD mass=0.1; | ||||
| @@ -296,6 +348,7 @@ public: | ||||
|     ///////// Welcome message //////////// | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl; | ||||
|     std::cout<<GridLogMessage << "* Nc             : "<<Nc<<std::endl; | ||||
|     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl; | ||||
|     std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl; | ||||
| @@ -324,7 +377,7 @@ public: | ||||
|     typedef LatticeGaugeFieldF Gauge; | ||||
|      | ||||
|     ///////// Source preparation //////////// | ||||
|     Gauge Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);  | ||||
|     Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu);  | ||||
|     Fermion src   (FGrid); random(RNG5,src); | ||||
|     Fermion src_e (FrbGrid); | ||||
|     Fermion src_o (FrbGrid); | ||||
| @@ -369,7 +422,7 @@ public: | ||||
| 	} | ||||
| 	FGrid->Barrier(); | ||||
| 	double t1=usecond(); | ||||
| 	uint64_t ncall = 50; | ||||
| 	uint64_t ncall = 500; | ||||
|  | ||||
| 	FGrid->Broadcast(0,&ncall,sizeof(ncall)); | ||||
|  | ||||
| @@ -387,7 +440,13 @@ public: | ||||
| 	FGrid->Barrier(); | ||||
| 	 | ||||
| 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||
| 	double flops=(1344.0*volume)/2; | ||||
|  | ||||
| 	// Nc=3 gives | ||||
| 	// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 | ||||
| 	// 1344 = Nc* (6+(Nc-1)*8)*2*(2*Nd) + (2*Nd)*Nc*2*2  + (2*Nd)*Nc*Ns*2 | ||||
| 	//	double flops=(1344.0*volume)/2; | ||||
| 	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns  + Nd*Nc*Ns*2; | ||||
| 	double flops=(fps*volume)/2; | ||||
| 	double mf_hi, mf_lo, mf_err; | ||||
|  | ||||
| 	timestat.statistics(t_time); | ||||
| @@ -402,6 +461,7 @@ public: | ||||
| 	if ( mflops>mflops_best ) mflops_best = mflops; | ||||
| 	if ( mflops<mflops_worst) mflops_worst= mflops; | ||||
|  | ||||
| 	std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl; | ||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; | ||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl; | ||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl; | ||||
| @@ -478,7 +538,7 @@ public: | ||||
|     typedef typename Action::FermionField Fermion;  | ||||
|     typedef LatticeGaugeFieldF Gauge; | ||||
|      | ||||
|     Gauge Umu(FGrid);  SU3::HotConfiguration(RNG4,Umu);  | ||||
|     Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu);  | ||||
|  | ||||
|     typename Action::ImplParams params; | ||||
|     Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); | ||||
| @@ -596,11 +656,12 @@ int main (int argc, char ** argv) | ||||
| #endif | ||||
|   Benchmark::Decomposition(); | ||||
|  | ||||
|   int do_su4=1; | ||||
|   int do_memory=1; | ||||
|   int do_comms =1; | ||||
|  | ||||
|   int sel=2; | ||||
|   std::vector<int> L_list({16,24,32}); | ||||
|   int sel=4; | ||||
|   std::vector<int> L_list({8,12,16,24,32}); | ||||
|   int selm1=sel-1; | ||||
|  | ||||
|   std::vector<double> wilson; | ||||
| @@ -624,7 +685,6 @@ int main (int argc, char ** argv) | ||||
|     dwf4.push_back(result); | ||||
|   } | ||||
|  | ||||
|   /* | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl; | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
| @@ -632,14 +692,13 @@ int main (int argc, char ** argv) | ||||
|     double result = Benchmark::Staggered(L_list[l]) ; | ||||
|     staggered.push_back(result); | ||||
|   } | ||||
|   */ | ||||
|  | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\tt Staggered" <<std::endl; | ||||
|   for(int l=0;l<L_list.size();l++){ | ||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] <<std::endl; | ||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; | ||||
|   } | ||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|  | ||||
| @@ -651,6 +710,13 @@ int main (int argc, char ** argv) | ||||
|     Benchmark::Memory(); | ||||
|   } | ||||
|  | ||||
|   if ( do_su4 ) { | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     Benchmark::SU4(); | ||||
|   } | ||||
|    | ||||
|   if ( do_comms && (NN>1) ) { | ||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||
|     std::cout<<GridLogMessage << " Communications benchmark " <<std::endl; | ||||
|   | ||||
| @@ -108,7 +108,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   std::cout << GridLogMessage << "Drawing gauge field" << std::endl; | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4,Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; | ||||
| #if 0 | ||||
|   Umu=1.0; | ||||
|   | ||||
							
								
								
									
										364
									
								
								benchmarks/Benchmark_dwf_fp32.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										364
									
								
								benchmarks/Benchmark_dwf_fp32.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,364 @@ | ||||
|  /************************************************************************************* | ||||
|     Grid physics library, www.github.com/paboyle/Grid | ||||
|     Source file: ./benchmarks/Benchmark_dwf_fp32.cc | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
|     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #ifdef GRID_CUDA | ||||
| #define CUDA_PROFILE | ||||
| #endif | ||||
|  | ||||
| #ifdef CUDA_PROFILE | ||||
| #include <cuda_profiler_api.h> | ||||
| #endif | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| // Aggregate holding a single value of type T in its `internal` member. | ||||
| template <typename T> | ||||
| struct scal | ||||
| { | ||||
|   T internal; | ||||
| }; | ||||
|  | ||||
|   // Gamma-matrix identifiers in X, Y, Z, T order; indexed by mu in the reference loops. | ||||
|   Gamma::Algebra Gmu[] = {Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, | ||||
|                           Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT}; | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|  | ||||
|   int threads = GridThread::GetThreads(); | ||||
|  | ||||
|   Coordinate latt4 = GridDefaultLatt(); | ||||
|   int Ls=8; | ||||
|   for(int i=0;i<argc;i++) | ||||
|     if(std::string(argv[i]) == "-Ls"){ | ||||
|       std::stringstream ss(argv[i+1]); ss >> Ls; | ||||
|     } | ||||
|  | ||||
|   GridLogLayout(); | ||||
|  | ||||
|   long unsigned int single_site_flops = 8*Nc*(7+16*Nc); | ||||
|  | ||||
|  | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Making s innermost grids"<<std::endl; | ||||
|   GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); | ||||
|   GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG")); | ||||
|   std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; | ||||
|   GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG")); | ||||
|   std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||
|  | ||||
|   LatticeFermionF src   (FGrid); random(RNG5,src); | ||||
| #if 0 | ||||
|   src = Zero(); | ||||
|   { | ||||
|     Coordinate origin({0,0,0,latt4[2]-1,0}); | ||||
|     SpinColourVectorF tmp; | ||||
|     tmp=Zero(); | ||||
|     tmp()(0)(0)=Complex(-2.0,0.0); | ||||
|     std::cout << " source site 0 " << tmp<<std::endl; | ||||
|     pokeSite(tmp,src,origin); | ||||
|   } | ||||
| #else | ||||
|   RealD N2 = 1.0/::sqrt(norm2(src)); | ||||
|   src = src*N2; | ||||
| #endif | ||||
|  | ||||
|  | ||||
|   LatticeFermionF result(FGrid); result=Zero(); | ||||
|   LatticeFermionF    ref(FGrid);    ref=Zero(); | ||||
|   LatticeFermionF    tmp(FGrid); | ||||
|   LatticeFermionF    err(FGrid); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Drawing gauge field" << std::endl; | ||||
|   LatticeGaugeFieldF Umu(UGrid); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; | ||||
| #if 0 | ||||
|   Umu=1.0; | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     LatticeColourMatrixF ttmp(UGrid); | ||||
|     ttmp = PeekIndex<LorentzIndex>(Umu,mu); | ||||
|     //    if (mu !=2 ) ttmp = 0; | ||||
|     //    ttmp = ttmp* pow(10.0,mu); | ||||
|     PokeIndex<LorentzIndex>(Umu,ttmp,mu); | ||||
|   } | ||||
|   std::cout << GridLogMessage << "Forced to diagonal " << std::endl; | ||||
| #endif | ||||
|  | ||||
|   //////////////////////////////////// | ||||
|   // Naive wilson implementation | ||||
|   //////////////////////////////////// | ||||
|   // replicate across fifth dimension | ||||
|   LatticeGaugeFieldF Umu5d(FGrid); | ||||
|   std::vector<LatticeColourMatrixF> U(4,FGrid); | ||||
|   { | ||||
|     autoView( Umu5d_v, Umu5d, CpuWrite); | ||||
|     autoView( Umu_v  , Umu  , CpuRead); | ||||
|     for(int ss=0;ss<Umu.Grid()->oSites();ss++){ | ||||
|       for(int s=0;s<Ls;s++){ | ||||
| 	Umu5d_v[Ls*ss+s] = Umu_v[ss]; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|     U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu); | ||||
|   } | ||||
|   std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; | ||||
|  | ||||
|   if (1) | ||||
|   { | ||||
|     ref = Zero(); | ||||
|     for(int mu=0;mu<Nd;mu++){ | ||||
|  | ||||
|       tmp = U[mu]*Cshift(src,mu+1,1); | ||||
|       ref=ref + tmp - Gamma(Gmu[mu])*tmp; | ||||
|  | ||||
|       tmp =adj(U[mu])*src; | ||||
|       tmp =Cshift(tmp,mu+1,-1); | ||||
|       ref=ref + tmp + Gamma(Gmu[mu])*tmp; | ||||
|     } | ||||
|     ref = -0.5*ref; | ||||
|   } | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   RealD M5  =1.8; | ||||
|  | ||||
|   RealD NP = UGrid->_Nprocessors; | ||||
|   RealD NN = UGrid->NodeCount(); | ||||
|  | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl; | ||||
|   std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl; | ||||
|   if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|   if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
| #ifdef GRID_OMP | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||
| #endif | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||
|  | ||||
|   DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||
|   int ncall =1000; | ||||
|  | ||||
|   if (1) { | ||||
|     FGrid->Barrier(); | ||||
|     Dw.ZeroCounters(); | ||||
|     Dw.Dhop(src,result,0); | ||||
|     std::cout<<GridLogMessage<<"Called warmup"<<std::endl; | ||||
|     double t0=usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
|       __SSC_START; | ||||
|       Dw.Dhop(src,result,0); | ||||
|       __SSC_STOP; | ||||
|     } | ||||
|     double t1=usecond(); | ||||
|     FGrid->Barrier(); | ||||
|  | ||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||
|     double flops=single_site_flops*volume*ncall; | ||||
|  | ||||
|     auto nsimd = vComplex::Nsimd(); | ||||
|     auto simdwidth = sizeof(vComplex); | ||||
|  | ||||
|     // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors | ||||
|     double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); | ||||
|  | ||||
|     // mem: Nd Wilson * Ls, Nd gauge, Nc colors | ||||
|     double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); | ||||
|  | ||||
|     std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; | ||||
|     //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; | ||||
|     //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl; | ||||
|     std::cout<<GridLogMessage << "RF  GiB/s (base 2) =   "<< 1000000. * data_rf/((t1-t0))<<std::endl; | ||||
|     std::cout<<GridLogMessage << "mem GiB/s (base 2) =   "<< 1000000. * data_mem/((t1-t0))<<std::endl; | ||||
|     err = ref-result; | ||||
|     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|     //exit(0); | ||||
|  | ||||
|     if(( norm2(err)>1.0e-4) ) { | ||||
|       /* | ||||
|       std::cout << "RESULT\n " << result<<std::endl; | ||||
|       std::cout << "REF   \n " << ref   <<std::endl; | ||||
|       std::cout << "ERR   \n " << err   <<std::endl; | ||||
|       */ | ||||
|       std::cout<<GridLogMessage << "WRONG RESULT" << std::endl; | ||||
|       FGrid->Barrier(); | ||||
|       exit(-1); | ||||
|     } | ||||
|     assert (norm2(err)< 1.0e-4 ); | ||||
|     Dw.Report(); | ||||
|   } | ||||
|  | ||||
|   if (1) | ||||
|   { // Naive wilson dag implementation | ||||
|     ref = Zero(); | ||||
|     for(int mu=0;mu<Nd;mu++){ | ||||
|  | ||||
|       //    ref =  src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x | ||||
|       tmp = U[mu]*Cshift(src,mu+1,1); | ||||
|       { | ||||
| 	autoView( ref_v, ref, CpuWrite); | ||||
| 	autoView( tmp_v, tmp, CpuRead); | ||||
| 	for(int i=0;i<ref_v.size();i++){ | ||||
| 	  ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ; | ||||
| 	} | ||||
|       } | ||||
|  | ||||
|       tmp =adj(U[mu])*src; | ||||
|       tmp =Cshift(tmp,mu+1,-1); | ||||
|       { | ||||
| 	autoView( ref_v, ref, CpuWrite); | ||||
| 	autoView( tmp_v, tmp, CpuRead); | ||||
| 	for(int i=0;i<ref_v.size();i++){ | ||||
| 	  ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ; | ||||
| 	} | ||||
|       } | ||||
|     } | ||||
|     ref = -0.5*ref; | ||||
|   } | ||||
|   //  dump=1; | ||||
|   Dw.Dhop(src,result,1); | ||||
|   std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl; | ||||
|   std::cout<<GridLogMessage << "Called DwDag"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl; | ||||
|   err = ref-result; | ||||
|   std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl; | ||||
|   if((norm2(err)>1.0e-4)){ | ||||
| /* | ||||
| 	std::cout<< "DAG RESULT\n "  <<ref     << std::endl; | ||||
| 	std::cout<< "DAG sRESULT\n " <<result  << std::endl; | ||||
| 	std::cout<< "DAG ERR   \n "  << err    <<std::endl; | ||||
| */ | ||||
|   } | ||||
|   LatticeFermionF src_e (FrbGrid); | ||||
|   LatticeFermionF src_o (FrbGrid); | ||||
|   LatticeFermionF r_e   (FrbGrid); | ||||
|   LatticeFermionF r_o   (FrbGrid); | ||||
|   LatticeFermionF r_eo  (FGrid); | ||||
|  | ||||
|   std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl; | ||||
|   pickCheckerboard(Even,src_e,src); | ||||
|   pickCheckerboard(Odd,src_o,src); | ||||
|  | ||||
|   std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl; | ||||
|  | ||||
|  | ||||
|   // S-direction is INNERMOST and takes no part in the parity. | ||||
|   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl; | ||||
|   if ( sizeof(RealF)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||
|   if ( sizeof(RealF)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl; | ||||
| #ifdef GRID_OMP | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; | ||||
| #endif | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl; | ||||
|   if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|   { | ||||
|     Dw.ZeroCounters(); | ||||
|     FGrid->Barrier(); | ||||
|     Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
|     double t0=usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
| #ifdef CUDA_PROFILE | ||||
|       if(i==10) cudaProfilerStart(); | ||||
| #endif | ||||
|       Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
| #ifdef CUDA_PROFILE | ||||
|       if(i==20) cudaProfilerStop(); | ||||
| #endif | ||||
|     } | ||||
|     double t1=usecond(); | ||||
|     FGrid->Barrier(); | ||||
|  | ||||
|     double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||
|     double flops=(single_site_flops*volume*ncall)/2.0; | ||||
|  | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl; | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl; | ||||
|     std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NN<<std::endl; | ||||
|     Dw.Report(); | ||||
|   } | ||||
|   Dw.DhopEO(src_o,r_e,DaggerNo); | ||||
|   Dw.DhopOE(src_e,r_o,DaggerNo); | ||||
|   Dw.Dhop  (src  ,result,DaggerNo); | ||||
|  | ||||
|   std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl; | ||||
|  | ||||
|   setCheckerboard(r_eo,r_o); | ||||
|   setCheckerboard(r_eo,r_e); | ||||
|  | ||||
|   err = r_eo-result; | ||||
|   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl; | ||||
|   if((norm2(err)>1.0e-4)){ | ||||
|     /* | ||||
| 	std::cout<< "Deo RESULT\n " <<r_eo << std::endl; | ||||
| 	std::cout<< "Deo REF\n " <<result  << std::endl; | ||||
| 	std::cout<< "Deo ERR   \n " << err <<std::endl; | ||||
|     */ | ||||
|   } | ||||
|  | ||||
|   pickCheckerboard(Even,src_e,err); | ||||
|   pickCheckerboard(Odd,src_o,err); | ||||
|   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; | ||||
|   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; | ||||
|  | ||||
|   assert(norm2(src_e)<1.0e-4); | ||||
|   assert(norm2(src_o)<1.0e-4); | ||||
|   Grid_finalize(); | ||||
|   exit(0); | ||||
| } | ||||
| @@ -63,7 +63,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   std::cout << GridLogMessage << "Drawing gauge field" << std::endl; | ||||
|   LatticeGaugeFieldF Umu(UGrid);  | ||||
|   SU3::HotConfiguration(RNG4,Umu);  | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu);  | ||||
|   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   | ||||
| @@ -30,7 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  ; | ||||
|  | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| @@ -53,7 +53,7 @@ int main (int argc, char ** argv) | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||
|   std::cout << GridLogMessage << "Seeded"<<std::endl; | ||||
|  | ||||
|   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); | ||||
|   LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|  | ||||
|   std::cout << GridLogMessage << "made random gauge fields"<<std::endl; | ||||
|  | ||||
|   | ||||
| @@ -36,12 +36,12 @@ int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
| #define LMAX (48) | ||||
| #define LMAX (40) | ||||
| #define LMIN (8) | ||||
| #define LADD (8) | ||||
|  | ||||
|   int64_t Nwarm=50; | ||||
|   int64_t Nloop=500; | ||||
|   int64_t Nwarm=10; | ||||
|   int64_t Nloop=100; | ||||
|  | ||||
|   Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||
|   Coordinate mpi_layout  = GridDefaultMpi(); | ||||
| @@ -118,6 +118,41 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|     } | ||||
|  | ||||
|  | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z=z+ x*y"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|  | ||||
|   for(int lat=LMIN;lat<=LMAX;lat+=LADD){ | ||||
|  | ||||
|       Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||
|       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||
|  | ||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||
|       GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|  | ||||
|       LatticeColourMatrix z(&Grid); random(pRNG,z); | ||||
|       LatticeColourMatrix x(&Grid); random(pRNG,x); | ||||
|       LatticeColourMatrix y(&Grid); random(pRNG,y); | ||||
|  | ||||
|       for(int64_t i=0;i<Nwarm;i++){ | ||||
| 	z=z+x*y; | ||||
|       } | ||||
|       double start=usecond(); | ||||
|       for(int64_t i=0;i<Nloop;i++){ | ||||
| 	z=z+x*y; | ||||
|       } | ||||
|       double stop=usecond(); | ||||
|       double time = (stop-start)/Nloop*1000.0; | ||||
|        | ||||
|       double bytes=4*vol*Nc*Nc*sizeof(Complex); | ||||
|       double flops=Nc*Nc*(6+8+8)*vol; | ||||
|       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"    \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; | ||||
|  | ||||
|     } | ||||
|  | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  mult(z,x,y)"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
| @@ -143,7 +178,6 @@ int main (int argc, char ** argv) | ||||
|       double start=usecond(); | ||||
|       for(int64_t i=0;i<Nloop;i++){ | ||||
| 	mult(z,x,y); | ||||
| 	//	mac(z,x,y); | ||||
|       } | ||||
|       double stop=usecond(); | ||||
|       double time = (stop-start)/Nloop*1000.0; | ||||
| @@ -191,7 +225,7 @@ int main (int argc, char ** argv) | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  CovShiftForward(z,x,y)"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GB/s (incl Cshift)\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|  | ||||
|   for(int lat=LMIN;lat<=LMAX;lat+=LADD){ | ||||
| @@ -216,16 +250,15 @@ int main (int argc, char ** argv) | ||||
| 	 | ||||
| 	 | ||||
| 	    double bytes=3*vol*Nc*Nc*sizeof(Complex); | ||||
| 	    double ncbytes=5*vol*Nc*Nc*sizeof(Complex); | ||||
| 	    double flops=Nc*Nc*(6+8+8)*vol; | ||||
| 	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<ncbytes/time<<"\t\t" << flops/time<<std::endl; | ||||
| 	    std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; | ||||
|       } | ||||
|   } | ||||
| #if 1 | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "= Benchmarking SU3xSU3  z= x * Cshift(y)"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GB/s (incl Cshift)\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl; | ||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||
|  | ||||
|   for(int lat=LMIN;lat<=LMAX;lat+=LADD){ | ||||
| @@ -259,11 +292,10 @@ int main (int argc, char ** argv) | ||||
| 	tmult  = tmult /Nloop; | ||||
| 	 | ||||
| 	double bytes=3*vol*Nc*Nc*sizeof(Complex); | ||||
| 	double ncbytes=5*vol*Nc*Nc*sizeof(Complex); | ||||
| 	double flops=Nc*Nc*(6+8+8)*vol; | ||||
| 	std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl; | ||||
| 	time = time * 1000; // convert to NS for GB/s | ||||
| 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" <<ncbytes/time<<"\t\t" << flops/time<<std::endl; | ||||
| 	std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl; | ||||
|       } | ||||
|     } | ||||
| #endif | ||||
|   | ||||
| @@ -187,7 +187,8 @@ int main (int argc, char ** argv) | ||||
| 	  auto xx = coalescedRead(x_v[ss]); | ||||
| 	  auto yy = coalescedRead(y_v[ss]); | ||||
| 	  auto zz = coalescedRead(z_v[ss]); | ||||
| 	  zz = zz+xx*yy; | ||||
| 	  //zz = zz+xx*yy; | ||||
| 	  mac(&zz,&xx,&yy); | ||||
| 	  coalescedWrite(z_v[ss],zz); | ||||
|         }); | ||||
|       } | ||||
|   | ||||
							
								
								
									
										60
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										60
									
								
								configure.ac
									
									
									
									
									
								
							| @@ -123,6 +123,24 @@ case ${ac_LAPACK} in | ||||
|         AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; | ||||
| esac | ||||
|  | ||||
| ############### Nc | ||||
| # Number of colours of the gauge group; defines the preprocessor symbol Config_Nc. | ||||
| # Accepted values are 2, 3, 4 and 5. When the option is not given, 3 is used. | ||||
| AC_ARG_ENABLE([Nc], | ||||
|     [AC_HELP_STRING([--enable-Nc=2|3|4|5], [number of colours Nc of the gauge group (default: 3)])], | ||||
|     [ac_Nc=${enable_Nc}], [ac_Nc=3]) | ||||
|  | ||||
| # Each supported value is spelled out so that Config_Nc is always a literal integer. | ||||
| case ${ac_Nc} in | ||||
|     2) | ||||
|         AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);; | ||||
|     3) | ||||
|         AC_DEFINE([Config_Nc],[3],[Gauge group Nc]);; | ||||
|     4) | ||||
|         AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);; | ||||
|     5) | ||||
|         AC_DEFINE([Config_Nc],[5],[Gauge group Nc]);; | ||||
|     *) | ||||
|       AC_MSG_ERROR([Unsupported gauge group choice Nc = ${ac_Nc}]);; | ||||
| esac | ||||
|  | ||||
| ############### FP16 conversions | ||||
| AC_ARG_ENABLE([sfw-fp16], | ||||
|     [AC_HELP_STRING([--enable-sfw-fp16=yes|no], [enable software fp16 comms])], | ||||
| @@ -330,12 +348,18 @@ case ${CXXTEST} in | ||||
|     fi | ||||
|     ;; | ||||
|   hipcc) | ||||
|     CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" | ||||
| #    CXXFLAGS="$CXXFLAGS -Xcompiler -fno-strict-aliasing --expt-extended-lambda --expt-relaxed-constexpr" | ||||
|     CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" | ||||
|     CXXLD=${CXX} | ||||
|     if test $ac_openmp = yes; then | ||||
|        CXXFLAGS="$CXXFLAGS -Xcompiler -fopenmp" | ||||
|     fi | ||||
|     ;; | ||||
|   dpcpp) | ||||
|     LDFLAGS="$LDFLAGS" | ||||
|     CXXFLAGS="$CXXFLAGS" | ||||
|     CXXLD=${CXX} | ||||
|     ;; | ||||
|   *) | ||||
|     CXXLD=${CXX} | ||||
|     CXXFLAGS="$CXXFLAGS -fno-strict-aliasing" | ||||
| @@ -453,23 +477,24 @@ esac | ||||
| AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS" | ||||
| AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS" | ||||
|  | ||||
| ############### Precision selection | ||||
| AC_ARG_ENABLE([precision], | ||||
|               [AC_HELP_STRING([--enable-precision=single|double], | ||||
|                               [Select default word size of Real])], | ||||
|               [ac_PRECISION=${enable_precision}],[ac_PRECISION=double]) | ||||
| ############### Precision selection - deprecate | ||||
| #AC_ARG_ENABLE([precision], | ||||
| #              [AC_HELP_STRING([--enable-precision=single|double], | ||||
| #                              [Select default word size of Real])], | ||||
| #              [ac_PRECISION=${enable_precision}],[ac_PRECISION=double]) | ||||
|  | ||||
| case ${ac_PRECISION} in | ||||
|      single) | ||||
|        AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] ) | ||||
|      ;; | ||||
|      double) | ||||
|        AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) | ||||
|      ;; | ||||
|      *) | ||||
|      AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]); | ||||
|      ;; | ||||
| esac | ||||
| AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) | ||||
|  | ||||
| #case ${ac_PRECISION} in | ||||
| #     single) | ||||
| #       AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] ) | ||||
| #     ;; | ||||
| #     double) | ||||
| #     ;; | ||||
| #     *) | ||||
| #     AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]); | ||||
| #     ;; | ||||
| #esac | ||||
|  | ||||
| ######################  Shared memory allocation technique under MPI3 | ||||
| AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone], | ||||
| @@ -650,6 +675,7 @@ os (target)                 : $target_os | ||||
| compiler vendor             : ${ax_cv_cxx_compiler_vendor} | ||||
| compiler version            : ${ax_cv_gxx_version} | ||||
| ----- BUILD OPTIONS ----------------------------------- | ||||
| Nc                          : ${ac_Nc} | ||||
| SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG} | ||||
| Threading                   : ${ac_openmp} | ||||
| Acceleration                : ${ac_ACCELERATOR} | ||||
|   | ||||
| @@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu | ||||
|  | ||||
| This is the build for everyday development and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. Xcode builds the `Debug` configuration with debug symbols for full debugging: | ||||
|  | ||||
|     ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridDebug --enable-comms=none | ||||
|     ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Debug | ||||
|  | ||||
| #### 2. `Release` | ||||
|  | ||||
| Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation): | ||||
| Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`: | ||||
|  | ||||
|     ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=single --prefix=$GridPre/GridRelease --enable-comms=none | ||||
|     ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Release | ||||
|  | ||||
| #### 3. `MPIDebug` | ||||
|  | ||||
| Debug configuration with MPI: | ||||
|  | ||||
|     ../configure CXX=clang++ --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-precision=double --prefix=$GridPre/GridMPIDebug --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx | ||||
|     ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug | ||||
|  | ||||
| ### 5.3 Build Grid | ||||
|  | ||||
|   | ||||
| @@ -178,15 +178,10 @@ Then enter the cloned directory and set up the build system:: | ||||
| Now you can execute the `configure` script to generate makefiles (here from a build directory):: | ||||
|  | ||||
|   mkdir build; cd build | ||||
|   ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto \ | ||||
|   ../configure --enable-simd=AVX --enable-comms=mpi-auto \ | ||||
|       --prefix=<path> | ||||
|  | ||||
| where:: | ||||
|  | ||||
|   --enable-precision=single|double | ||||
|  | ||||
| sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations, | ||||
| and that explicit type selection be made at all points. Naturally, most code will be type templated in any case.:: | ||||
| :: | ||||
|  | ||||
|    --enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX | ||||
|  | ||||
| @@ -236,7 +231,7 @@ Detailed build configuration options | ||||
|   --enable-mkl[=path]                     use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional). | ||||
|   --enable-simd=code                      setup Grid for the SIMD target `<code>`(default: `GEN`). A list of possible SIMD targets is detailed in a section below. | ||||
|   --enable-gen-simd-width=size            select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes. | ||||
|   --enable-precision=single|double        set the default precision (default: `double`). | ||||
|   --enable-precision=single|double        set the default precision (default: `double`). **Deprecated option** | ||||
|   --enable-comms=mpi|none                 use `<comm>` for message passing (default: `none`). | ||||
|   --enable-rng=sitmo|ranlux48|mt19937     choose the RNG (default: `sitmo`). | ||||
|   --disable-timers                        disable system dependent high-resolution timers. | ||||
| @@ -304,8 +299,7 @@ Build setup for Intel Knights Landing platform | ||||
|  | ||||
| The following configuration is recommended for the Intel Knights Landing platform:: | ||||
|  | ||||
|   ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
|   ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi-auto  \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=icpc MPICXX=mpiicpc | ||||
| @@ -314,8 +308,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. | ||||
|  | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: | ||||
|  | ||||
|   ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
|   ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi       \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -332,8 +325,7 @@ presently performs better with use of more than one rank per node, using shared | ||||
| for interior communication. | ||||
| We recommend four ranks per node for best performance, but the optimum depends on the local volume. :: | ||||
|  | ||||
|    ../configure --enable-precision=double\ | ||||
|              --enable-simd=KNL        \ | ||||
|    ../configure --enable-simd=KNL        \ | ||||
|              --enable-comms=mpi-auto \ | ||||
|              --enable-mkl             \ | ||||
|              CC=icpc MPICXX=mpiicpc  | ||||
| @@ -343,8 +335,7 @@ Build setup for Intel Haswell Xeon platform | ||||
|  | ||||
| The following configuration is recommended for the Intel Haswell platform:: | ||||
|  | ||||
|   ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
|   ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi-auto \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=icpc MPICXX=mpiicpc | ||||
| @@ -360,8 +351,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. | ||||
|  | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: | ||||
|  | ||||
|   ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
|   ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -379,8 +369,7 @@ Build setup for Intel Skylake Xeon platform | ||||
|  | ||||
| The following configuration is recommended for the Intel Skylake platform:: | ||||
|  | ||||
|   ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX512     \ | ||||
|   ../configure --enable-simd=AVX512     \ | ||||
|              --enable-comms=mpi      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=mpiicpc | ||||
| @@ -396,8 +385,7 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. | ||||
|  | ||||
| If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: | ||||
|  | ||||
|   ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX512     \ | ||||
|   ../configure --enable-simd=AVX512     \ | ||||
|              --enable-comms=mpi      \ | ||||
|              --enable-mkl             \ | ||||
|              CXX=CC CC=cc | ||||
| @@ -422,8 +410,7 @@ and 8 threads per rank. | ||||
| The following configuration is recommended for the AMD EPYC platform:: | ||||
|  | ||||
|  | ||||
|   ../configure --enable-precision=double\ | ||||
|              --enable-simd=AVX2       \ | ||||
|   ../configure --enable-simd=AVX2       \ | ||||
|              --enable-comms=mpi \ | ||||
|              CXX=mpicxx  | ||||
|  | ||||
|   | ||||
| @@ -69,7 +69,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   std::vector<LatticeColourMatrix> U(4,&Fine); | ||||
|    | ||||
|   SU3::HotConfiguration(pRNGa,Umu); | ||||
|   SU<Nc>::HotConfiguration(pRNGa,Umu); | ||||
|  | ||||
|  | ||||
|   FieldMetaData header; | ||||
|   | ||||
| @@ -84,7 +84,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   std::vector<LatticeColourMatrix> U(4,&Fine); | ||||
|    | ||||
|   SU3::HotConfiguration(pRNGa,Umu); | ||||
|   SU<Nc>::HotConfiguration(pRNGa,Umu); | ||||
|  | ||||
|   FieldMetaData header; | ||||
|   std::string file("./ckpoint_lat.4000"); | ||||
|   | ||||
| @@ -80,7 +80,7 @@ int main (int argc, char ** argv) | ||||
|   GridParallelRNG          sRNG5(sFGrid);  sRNG5.SeedFixedIntegers(seeds5); | ||||
|  | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4,Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   RealD M5  =1.8; | ||||
|   | ||||
| @@ -202,7 +202,7 @@ int main (int argc, char ** argv) { | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4,Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   //  FieldMetaData header; | ||||
|   //  NerscIO::readConfiguration(Umu,header,Params.config); | ||||
|  | ||||
|   | ||||
| @@ -71,7 +71,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeGaugeFieldD Umu(UGrid); | ||||
|   LatticeGaugeFieldF Umu_f(UGrid_f);  | ||||
|    | ||||
|   SU3::HotConfiguration(RNG4,Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|  | ||||
|   precisionChange(Umu_f,Umu); | ||||
|    | ||||
|   | ||||
| @@ -69,7 +69,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeGaugeFieldD Umu(UGrid); | ||||
|   LatticeGaugeFieldF Umu_f(UGrid_f);  | ||||
|    | ||||
|   SU3::HotConfiguration(RNG4,Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|  | ||||
|   precisionChange(Umu_f,Umu); | ||||
|    | ||||
|   | ||||
| @@ -64,7 +64,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeFermion    ref(FGrid); ref=Zero(); | ||||
|   LatticeFermion    tmp(FGrid); | ||||
|   LatticeFermion    err(FGrid); | ||||
|   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); | ||||
|   LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|  | ||||
|   std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|   | ||||
| @@ -131,7 +131,7 @@ int main (int argc, char ** argv) | ||||
|   // LatticeFermion result(FGrid); result=Zero(); | ||||
|   // LatticeGaugeField Umu(UGrid);  | ||||
|  | ||||
|   // SU3::HotConfiguration(RNG4,Umu); | ||||
|   // SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|  | ||||
|   // std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|   // for(int mu=0;mu<Nd;mu++){ | ||||
|   | ||||
| @@ -69,7 +69,7 @@ int main (int argc, char ** argv) | ||||
|   GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5); | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||
|  | ||||
|   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); | ||||
|   LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   | ||||
| @@ -73,7 +73,7 @@ int main (int argc, char ** argv) | ||||
|     LatticeFermion    ref   (FGrid); ref = Zero(); | ||||
|     LatticeFermion    tmp   (FGrid); tmp = Zero(); | ||||
|     LatticeFermion    err   (FGrid); err = Zero(); | ||||
|     LatticeGaugeField Umu   (UGrid); SU3::HotConfiguration(RNG4, Umu); | ||||
|     LatticeGaugeField Umu   (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|     std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|  | ||||
|     // Only one non-zero (y) | ||||
|   | ||||
| @@ -72,7 +72,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeFermion    ref(FGrid);    ref=Zero(); | ||||
|   LatticeFermion    tmp(FGrid);    tmp=Zero(); | ||||
|   LatticeFermion    err(FGrid);    tmp=Zero(); | ||||
|   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); | ||||
|   LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|  | ||||
|   // Only one non-zero (y) | ||||
|   | ||||
| @@ -29,91 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include <Grid/Grid.h> | ||||
|  | ||||
| using namespace Grid; | ||||
|  | ||||
| void MomentumSpacePropagatorTest(RealD mass,RealD M5, LatticePropagator &prop) | ||||
| { | ||||
|   // Overwrites prop with num/denom, built below from mass, M5 and the per-direction momenta kmu; every temporary lattice lives on prop's grid. | ||||
|   GridBase *_grid = prop.Grid(); | ||||
|    | ||||
|   typedef LatticeFermion FermionField; | ||||
|   typedef LatticePropagator PropagatorField; | ||||
|   typedef typename FermionField::vector_type vector_type; | ||||
|   typedef typename FermionField::scalar_type ScalComplex; | ||||
|   typedef iSinglet<ScalComplex> Tcomplex; | ||||
|   typedef Lattice<iSinglet<vector_type> > LatComplex; | ||||
|    | ||||
|   Gamma::Algebra Gmu [] = { | ||||
|     Gamma::Algebra::GammaX, | ||||
|     Gamma::Algebra::GammaY, | ||||
|     Gamma::Algebra::GammaZ, | ||||
|     Gamma::Algebra::GammaT | ||||
|   }; | ||||
|  | ||||
|   Coordinate latt_size   = _grid->_fdimensions; | ||||
|    | ||||
|   PropagatorField   num  (_grid); num  = Zero(); | ||||
|  | ||||
|   LatComplex    sk(_grid);  sk = Zero(); | ||||
|   LatComplex    sk2(_grid); sk2= Zero(); | ||||
|   LatComplex    W(_grid); W= Zero(); | ||||
|   LatComplex    a(_grid); a= Zero(); | ||||
|   LatComplex    one  (_grid); one = ScalComplex(1.0,0.0); | ||||
|   LatComplex denom(_grid); denom= Zero(); | ||||
|   LatComplex cosha(_grid);  | ||||
|   LatComplex kmu(_grid);  | ||||
|   LatComplex Wea(_grid);  | ||||
|   LatComplex Wema(_grid);  | ||||
|  | ||||
|   ScalComplex ci(0.0,1.0); | ||||
|   SpinColourMatrixD identity = ComplexD(1.0); | ||||
|  | ||||
|   for(int mu=0;mu<Nd;mu++) { | ||||
|      | ||||
|     LatticeCoordinate(kmu,mu); | ||||
|      | ||||
|     RealD TwoPiL =  M_PI * 2.0/ latt_size[mu]; | ||||
|      | ||||
|     kmu = TwoPiL * kmu; | ||||
|     // Disabled: momentum shift for twisted boundary conditions, kmu = kmu + TwoPiL * one * twist[mu] | ||||
|      | ||||
|     sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5); | ||||
|     sk  = sk  +     sin(kmu)    *sin(kmu);  | ||||
|      | ||||
|     num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*identity); | ||||
|      | ||||
|   } | ||||
|    | ||||
|   W = one - M5 + sk2; | ||||
|  | ||||
|   //////////////////////////////////////////// | ||||
|   // cosh(alpha) = (1 + W*W + sk) / (2*abs(W)); alpha (stored in a) is then recovered per site with acosh | ||||
|   //////////////////////////////////////////// | ||||
|   cosha =  (one + W*W + sk) / (abs(W)*2.0); | ||||
|  | ||||
|   // FIXME: needs a lattice-wide acosh; for now acosh is applied one local site at a time through CPU views, asserting the argument is real and >= 1 | ||||
|   { | ||||
|   autoView(cosha_v,cosha,CpuRead); | ||||
|   autoView(a_v,a,CpuWrite); | ||||
|   for(int idx=0;idx<_grid->lSites();idx++){ | ||||
|     Coordinate lcoor(Nd); | ||||
|     Tcomplex cc; | ||||
|     //    RealD sgn; | ||||
|     _grid->LocalIndexToLocalCoor(idx,lcoor); | ||||
|     peekLocalSite(cc,cosha_v,lcoor); | ||||
|     assert((double)real(cc)>=1.0); | ||||
|     assert(fabs((double)imag(cc))<=1.0e-15); | ||||
|     cc = ScalComplex(::acosh(real(cc)),0.0); | ||||
|     pokeLocalSite(cc,a_v,lcoor); | ||||
|   }} | ||||
|    | ||||
|   Wea = ( exp( a) * abs(W)  ); | ||||
|   Wema= ( exp(-a) * abs(W)  ); | ||||
|    | ||||
|   num   = num + ( one - Wema ) * mass * identity; | ||||
|   denom= ( Wea - one ) + mass*mass * (one - Wema);  | ||||
|   prop = num/denom; | ||||
| } | ||||
|  | ||||
|  ; | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
| @@ -222,7 +138,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   LatticeGaugeFieldD Umu(&GRID); | ||||
|  | ||||
|   SU3::ColdConfiguration(pRNG,Umu); // Unit gauge | ||||
|   SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge | ||||
|   //  Umu=Zero(); | ||||
|   //////////////////////////////////////////////////// | ||||
|   // Wilson test | ||||
| @@ -391,17 +307,6 @@ int main (int argc, char ** argv) | ||||
|     RealD M5  =0.8; | ||||
|     DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,GRID,RBGRID,mass,M5); | ||||
|  | ||||
|     /////////////////// Test code for (1-m)^2 /////////////// | ||||
|     LatticePropagatorD prop1(&GRID); | ||||
|     LatticePropagatorD prop2(&GRID); | ||||
|     LatticeComplexD ratio(&GRID); | ||||
|     MomentumSpacePropagatorTest(0.0,M5,prop1); | ||||
|     MomentumSpacePropagatorTest(0.3,M5,prop2); | ||||
|     ratio=localNorm2(prop2); | ||||
|     ratio=ratio/localNorm2(prop1); | ||||
|     std::cout << ratio; | ||||
|     /////////////////// Test code for (1-m)^2 factor /////////////// | ||||
|  | ||||
|     // Momentum space prop | ||||
|     std::cout << " Solving by FFT and Feynman rules" <<std::endl; | ||||
|     bool fiveD = false; //calculate 4d free propagator | ||||
|   | ||||
| @@ -73,11 +73,11 @@ int main (int argc, char ** argv) | ||||
|   LatticeColourMatrix   xform2(&GRID); // Gauge xform | ||||
|   LatticeColourMatrix   xform3(&GRID); // Gauge xform | ||||
|    | ||||
|   SU3::ColdConfiguration(pRNG,Umu); // Unit gauge | ||||
|   SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge | ||||
|   Uorg=Umu; | ||||
|   Urnd=Umu; | ||||
|  | ||||
|   SU3::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge | ||||
|   SU<Nc>::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge | ||||
|  | ||||
|   Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu); | ||||
|   std::cout << " Initial plaquette "<<plaq << std::endl; | ||||
| @@ -121,7 +121,7 @@ int main (int argc, char ** argv) | ||||
|   std::cout<< "* Testing non-unit configuration                                *" <<std::endl; | ||||
|   std::cout<< "*****************************************************************" <<std::endl; | ||||
|  | ||||
|   SU3::HotConfiguration(pRNG,Umu); // Unit gauge | ||||
|   SU<Nc>::HotConfiguration(pRNG,Umu); // Unit gauge | ||||
|  | ||||
|   plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu); | ||||
|   std::cout << " Initial plaquette "<<plaq << std::endl; | ||||
| @@ -136,7 +136,7 @@ int main (int argc, char ** argv) | ||||
|   std::cout<< "*****************************************************************" <<std::endl; | ||||
|  | ||||
|   Umu=Urnd; | ||||
|   SU3::HotConfiguration(pRNG,Umu); // Unit gauge | ||||
|   SU<Nc>::HotConfiguration(pRNG,Umu); // Unit gauge | ||||
|  | ||||
|   plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu); | ||||
|   std::cout << " Initial plaquette "<<plaq << std::endl; | ||||
|   | ||||
| @@ -114,7 +114,7 @@ int main (int argc, char ** argv) | ||||
|   GridParallelRNG          RNG4_2f(UGrid_2f);  RNG4_2f.SeedFixedIntegers(seeds4); | ||||
|  | ||||
|   GparityGaugeField Umu_2f(UGrid_2f); | ||||
|   SU3::HotConfiguration(RNG4_2f,Umu_2f); | ||||
|   SU<Nc>::HotConfiguration(RNG4_2f,Umu_2f); | ||||
|  | ||||
|   StandardFermionField    src   (FGrid_2f);  | ||||
|   StandardFermionField    tmpsrc(FGrid_2f);  | ||||
|   | ||||
| @@ -61,7 +61,7 @@ int main (int argc, char ** argv) | ||||
|   FermionField    ref(&Grid);    ref=Zero(); | ||||
|   FermionField    tmp(&Grid);    tmp=Zero(); | ||||
|   FermionField    err(&Grid);    tmp=Zero(); | ||||
|   LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); | ||||
|   LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,&Grid); | ||||
|  | ||||
|   double volume=1; | ||||
|   | ||||
| @@ -66,14 +66,14 @@ int main(int argc, char** argv) { | ||||
|  | ||||
|   std::cout << GridLogMessage << "*********************************************" | ||||
|             << std::endl; | ||||
|   std::cout << GridLogMessage << "* Generators for SU(3)" << std::endl; | ||||
|   std::cout << GridLogMessage << "* Generators for SU(Nc" << std::endl; | ||||
|   std::cout << GridLogMessage << "*********************************************" | ||||
|             << std::endl; | ||||
|   SU3::printGenerators(); | ||||
|   std::cout << "Dimension of adjoint representation: "<< SU3Adjoint::Dimension << std::endl; | ||||
|   SU3Adjoint::printGenerators(); | ||||
|   SU3::testGenerators(); | ||||
|   SU3Adjoint::testGenerators(); | ||||
|   SU<Nc>::printGenerators(); | ||||
|   std::cout << "Dimension of adjoint representation: "<< SU<Nc>Adjoint::Dimension << std::endl; | ||||
|   SU<Nc>Adjoint::printGenerators(); | ||||
|   SU<Nc>::testGenerators(); | ||||
|   SU<Nc>Adjoint::testGenerators(); | ||||
|  | ||||
|   std::cout<<GridLogMessage<<"*********************************************"<<std::endl; | ||||
|   std::cout<<GridLogMessage<<"* Generators for SU(4)"<<std::endl; | ||||
| @@ -87,22 +87,22 @@ int main(int argc, char** argv) { | ||||
|   // Projectors  | ||||
|   GridParallelRNG gridRNG(grid); | ||||
|   gridRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||
|   SU3Adjoint::LatticeAdjMatrix Gauss(grid); | ||||
|   SU3::LatticeAlgebraVector ha(grid); | ||||
|   SU3::LatticeAlgebraVector hb(grid); | ||||
|   SU<Nc>Adjoint::LatticeAdjMatrix Gauss(grid); | ||||
|   SU<Nc>::LatticeAlgebraVector ha(grid); | ||||
|   SU<Nc>::LatticeAlgebraVector hb(grid); | ||||
|   random(gridRNG,Gauss); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl; | ||||
|   SU3Adjoint::projectOnAlgebra(ha, Gauss); | ||||
|   SU<Nc>Adjoint::projectOnAlgebra(ha, Gauss); | ||||
|   std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl; | ||||
|   std::cout << GridLogMessage << "Start projector" << std::endl; | ||||
|   SU3Adjoint::projector(hb, Gauss); | ||||
|   SU<Nc>Adjoint::projector(hb, Gauss); | ||||
|   std::cout << GridLogMessage << "end projector" << std::endl; | ||||
|  | ||||
|   std::cout << GridLogMessage << "ReStart projector" << std::endl; | ||||
|   SU3Adjoint::projector(hb, Gauss); | ||||
|   SU<Nc>Adjoint::projector(hb, Gauss); | ||||
|   std::cout << GridLogMessage << "end projector" << std::endl; | ||||
|   SU3::LatticeAlgebraVector diff = ha -hb; | ||||
|   SU<Nc>::LatticeAlgebraVector diff = ha -hb; | ||||
|   std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl; | ||||
|  | ||||
|  | ||||
| @@ -260,20 +260,20 @@ int main(int argc, char** argv) { | ||||
|   std::cout << GridLogMessage << "Test for the Two Index Symmetric projectors" | ||||
|       << std::endl; | ||||
|   // Projectors  | ||||
|   SU3TwoIndexSymm::LatticeTwoIndexMatrix Gauss2(grid); | ||||
|   SU<Nc>TwoIndexSymm::LatticeTwoIndexMatrix Gauss2(grid); | ||||
|   random(gridRNG,Gauss2); | ||||
|    | ||||
|   std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl; | ||||
|   SU3TwoIndexSymm::projectOnAlgebra(ha, Gauss2); | ||||
|   SU<Nc>TwoIndexSymm::projectOnAlgebra(ha, Gauss2); | ||||
|   std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl; | ||||
|   std::cout << GridLogMessage << "Start projector" << std::endl; | ||||
|   SU3TwoIndexSymm::projector(hb, Gauss2); | ||||
|   SU<Nc>TwoIndexSymm::projector(hb, Gauss2); | ||||
|   std::cout << GridLogMessage << "end projector" << std::endl; | ||||
|    | ||||
|   std::cout << GridLogMessage << "ReStart projector" << std::endl; | ||||
|   SU3TwoIndexSymm::projector(hb, Gauss2); | ||||
|   SU<Nc>TwoIndexSymm::projector(hb, Gauss2); | ||||
|   std::cout << GridLogMessage << "end projector" << std::endl; | ||||
|   SU3::LatticeAlgebraVector diff2 = ha - hb; | ||||
|   SU<Nc>::LatticeAlgebraVector diff2 = ha - hb; | ||||
|   std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl; | ||||
|   std::cout << GridLogMessage << "*********************************************" | ||||
|       << std::endl; | ||||
| @@ -284,20 +284,20 @@ int main(int argc, char** argv) { | ||||
|   std::cout << GridLogMessage << "Test for the Two index anti-Symmetric projectors" | ||||
|       << std::endl; | ||||
|   // Projectors | ||||
|   SU3TwoIndexAntiSymm::LatticeTwoIndexMatrix Gauss2a(grid); | ||||
|   SU<Nc>TwoIndexAntiSymm::LatticeTwoIndexMatrix Gauss2a(grid); | ||||
|   random(gridRNG,Gauss2a); | ||||
|    | ||||
|   std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl; | ||||
|   SU3TwoIndexAntiSymm::projectOnAlgebra(ha, Gauss2a); | ||||
|   SU<Nc>TwoIndexAntiSymm::projectOnAlgebra(ha, Gauss2a); | ||||
|   std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl; | ||||
|   std::cout << GridLogMessage << "Start projector" << std::endl; | ||||
|   SU3TwoIndexAntiSymm::projector(hb, Gauss2a); | ||||
|   SU<Nc>TwoIndexAntiSymm::projector(hb, Gauss2a); | ||||
|   std::cout << GridLogMessage << "end projector" << std::endl; | ||||
|    | ||||
|   std::cout << GridLogMessage << "ReStart projector" << std::endl; | ||||
|   SU3TwoIndexAntiSymm::projector(hb, Gauss2a); | ||||
|   SU<Nc>TwoIndexAntiSymm::projector(hb, Gauss2a); | ||||
|   std::cout << GridLogMessage << "end projector" << std::endl; | ||||
|   SU3::LatticeAlgebraVector diff2a = ha - hb; | ||||
|   SU<Nc>::LatticeAlgebraVector diff2a = ha - hb; | ||||
|   std::cout << GridLogMessage << "Difference: " << norm2(diff2a) << std::endl; | ||||
|   std::cout << GridLogMessage << "*********************************************" | ||||
|       << std::endl; | ||||
|   | ||||
| @@ -444,7 +444,7 @@ int main(int argc, char **argv) { | ||||
|       // Lattice 12x12 GEMM | ||||
|       scFooBar = scFoo * scBar; | ||||
|  | ||||
|       // Benchmark some simple operations LatticeSU3 * Lattice SU3. | ||||
|       // Benchmark some simple operations LatticeSU<Nc> * Lattice SU<Nc>. | ||||
|       double t0, t1, flops; | ||||
|       double bytes; | ||||
|       int ncall = 5000; | ||||
|   | ||||
| @@ -73,7 +73,7 @@ int main (int argc, char ** argv) | ||||
|     LatticeFermion    ref   (FGrid); ref = Zero(); | ||||
|     LatticeFermion    tmp   (FGrid); tmp = Zero(); | ||||
|     LatticeFermion    err   (FGrid); err = Zero(); | ||||
|     LatticeGaugeField Umu   (UGrid); SU3::HotConfiguration(RNG4, Umu); | ||||
|     LatticeGaugeField Umu   (UGrid); SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|     std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|  | ||||
|     // Only one non-zero (y) | ||||
|   | ||||
| @@ -55,7 +55,7 @@ int main (int argc, char ** argv) | ||||
|   GridParallelRNG  pRNG(grid); pRNG.SeedFixedIntegers(pseeds); | ||||
|   GridSerialRNG    sRNG;       sRNG.SeedFixedIntegers(sseeds); | ||||
|  | ||||
|   // SU3 colour operatoions | ||||
|   // SU<Nc> colour operatoions | ||||
|   LatticeColourMatrix link(grid); | ||||
|   LatticeColourMatrix staple(grid); | ||||
|  | ||||
| @@ -87,10 +87,10 @@ int main (int argc, char ** argv) | ||||
|  | ||||
| 	link = PeekIndex<LorentzIndex>(Umu,mu); | ||||
|  | ||||
| 	for( int subgroup=0;subgroup<SU3::su2subgroups();subgroup++ ) { | ||||
| 	for( int subgroup=0;subgroup<SU<Nc>::su2subgroups();subgroup++ ) { | ||||
|  | ||||
| 	  // update Even checkerboard | ||||
| 	  SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); | ||||
| 	  SU<Nc>::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); | ||||
|  | ||||
| 	} | ||||
|  | ||||
|   | ||||
| @@ -64,7 +64,7 @@ int main (int argc, char ** argv) | ||||
|   FermionField    err(&Grid);    tmp=Zero(); | ||||
|   FermionField phi   (&Grid); random(pRNG,phi); | ||||
|   FermionField chi   (&Grid); random(pRNG,chi); | ||||
|   LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); | ||||
|   LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,&Grid); | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -75,7 +75,7 @@ int main (int argc, char ** argv) | ||||
|   FermionField phi   (FGrid); random(pRNG5,phi); | ||||
|   FermionField chi   (FGrid); random(pRNG5,chi); | ||||
|  | ||||
|   LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu); | ||||
|   LatticeGaugeField Umu(UGrid); SU<Nc>::ColdConfiguration(pRNG4,Umu); | ||||
|   LatticeGaugeField Umua(UGrid); Umua=Umu; | ||||
|  | ||||
|   double volume=Ls; | ||||
|   | ||||
| @@ -84,7 +84,7 @@ int main (int argc, char ** argv) | ||||
|   FermionField chi   (FGrid); random(pRNG5,chi); | ||||
|  | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(pRNG4,Umu); | ||||
|   SU<Nc>::HotConfiguration(pRNG4,Umu); | ||||
|  | ||||
|   /* | ||||
|   for(int mu=1;mu<4;mu++){ | ||||
|   | ||||
| @@ -83,7 +83,7 @@ int main (int argc, char ** argv) | ||||
|   FermionField chi   (FGrid); random(pRNG5,chi); | ||||
|  | ||||
|   LatticeGaugeFieldF Umu(UGrid); | ||||
|   SU3::HotConfiguration(pRNG4,Umu); | ||||
|   SU<Nc>::HotConfiguration(pRNG4,Umu); | ||||
|  | ||||
|   /* | ||||
|   for(int mu=1;mu<4;mu++){ | ||||
|   | ||||
| @@ -64,7 +64,7 @@ int main (int argc, char ** argv) | ||||
|   FermionField    err(&Grid);    tmp=Zero(); | ||||
|   FermionField phi   (&Grid); random(pRNG,phi); | ||||
|   FermionField chi   (&Grid); random(pRNG,chi); | ||||
|   LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); | ||||
|   LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,&Grid); | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -74,7 +74,7 @@ int main(int argc, char **argv) | ||||
|   FermionField chi(&Grid); | ||||
|   random(pRNG, chi); | ||||
|   LatticeGaugeField Umu(&Grid); | ||||
|   SU3::HotConfiguration(pRNG, Umu); | ||||
|   SU<Nc>::HotConfiguration(pRNG, Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4, &Grid); | ||||
|  | ||||
|   double volume = 1; | ||||
|   | ||||
| @@ -70,7 +70,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeFermion    tmp(&Grid);    tmp=Zero(); | ||||
|   LatticeFermion    err(&Grid);    tmp=Zero(); | ||||
|   LatticeGaugeField Umu(&Grid);  | ||||
|   SU3::HotConfiguration(pRNG,Umu); | ||||
|   SU<Nc>::HotConfiguration(pRNG,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,&Grid); | ||||
|  | ||||
|   double volume=1; | ||||
|   | ||||
| @@ -71,7 +71,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeFermion    ref(&Grid);    ref=Zero(); | ||||
|   LatticeFermion    tmp(&Grid);    tmp=Zero(); | ||||
|   LatticeFermion    err(&Grid);    tmp=Zero(); | ||||
|   LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); | ||||
|   LatticeGaugeField Umu(&Grid); SU<Nc>::HotConfiguration(pRNG,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,&Grid); | ||||
|  | ||||
|   double volume=1; | ||||
|   | ||||
| @@ -116,7 +116,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   LatticeGaugeFieldF UmuF(UGridF); | ||||
|   SU3::HotConfiguration(RNG4,Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   precisionChange(UmuF,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|  | ||||
|   | ||||
| @@ -77,7 +77,7 @@ int main (int argc, char ** argv) | ||||
|   LatticeFermion    ref(FGrid); ref=Zero(); | ||||
|   LatticeFermion    tmp(FGrid); | ||||
|   LatticeFermion    err(FGrid); | ||||
|   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); | ||||
|   LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|  | ||||
| #if 0 | ||||
|   std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|   | ||||
| @@ -70,7 +70,7 @@ int main (int argc, char ** argv) | ||||
|   GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5); | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||
|  | ||||
|   LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); | ||||
|   LatticeGaugeField Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   std::vector<LatticeColourMatrix> U(4,UGrid); | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   | ||||
| @@ -71,9 +71,9 @@ int main (int argc, char ** argv) | ||||
|   std::string file("./ckpoint_lat.400"); | ||||
|   NerscIO::readConfiguration(Umu,header,file); | ||||
|  | ||||
|   //  SU3::ColdConfiguration(RNG4,Umu); | ||||
|   //  SU3::TepidConfiguration(RNG4,Umu); | ||||
|   //  SU3::HotConfiguration(RNG4,Umu); | ||||
|   //  SU<Nc>::ColdConfiguration(RNG4,Umu); | ||||
|   //  SU<Nc>::TepidConfiguration(RNG4,Umu); | ||||
|   //  SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   //  Umu=Zero(); | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   | ||||
| @@ -108,8 +108,8 @@ int main (int argc, char ** argv) | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||
|  | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::ColdConfiguration(Umu); | ||||
|   //  SU3::HotConfiguration(RNG4,Umu); | ||||
|   SU<Nc>::ColdConfiguration(Umu); | ||||
|   //  SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|  | ||||
|   RealD mass=0.3; | ||||
|   RealD M5  =1.0; | ||||
|   | ||||
| @@ -73,7 +73,7 @@ int main(int argc, char** argv) | ||||
|  | ||||
|   // Random gauge field | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4, Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|  | ||||
|   DomainWallEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf,  mf, mpv,  0.0, -1, M5); | ||||
|   DomainWallEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0,  1, M5); | ||||
|   | ||||
| @@ -77,7 +77,7 @@ int main(int argc, char** argv) | ||||
|  | ||||
|   // Random gauge field | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4, Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|  | ||||
|   // GparityDomainWallFermionR::ImplParams params; | ||||
|   FermionAction::ImplParams params; | ||||
|   | ||||
| @@ -75,7 +75,7 @@ int main(int argc, char** argv) | ||||
|  | ||||
|   // Random gauge field | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4, Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|  | ||||
|   MobiusEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf,  mf, mpv,  0.0, -1, M5, b, c); | ||||
|   MobiusEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0,  1, M5, b, c); | ||||
|   | ||||
| @@ -79,7 +79,7 @@ int main(int argc, char** argv) | ||||
|  | ||||
|   // Random gauge field | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4, Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|  | ||||
|   FermionAction::ImplParams params; | ||||
|   FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf,  mf, mpv,  0.0, -1, M5, b, c, params); | ||||
|   | ||||
| @@ -102,7 +102,7 @@ int main(int argc, char **argv) | ||||
|  | ||||
|   // Random gauge field | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4, Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|  | ||||
|   // Initialize RHMC fermion operators | ||||
|   DomainWallFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5); | ||||
|   | ||||
| @@ -104,7 +104,7 @@ int main(int argc, char **argv) | ||||
|  | ||||
|   // Random gauge field | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4, Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|  | ||||
|   // Initialize RHMC fermion operators | ||||
|   GparityDomainWallFermionR::ImplParams params; | ||||
|   | ||||
| @@ -104,7 +104,7 @@ int main(int argc, char **argv) | ||||
|  | ||||
|   // Random gauge field | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4, Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|  | ||||
|   // Initialize RHMC fermion operators | ||||
|   MobiusFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c); | ||||
|   | ||||
| @@ -106,7 +106,7 @@ int main(int argc, char **argv) | ||||
|  | ||||
|   // Random gauge field | ||||
|   LatticeGaugeField Umu(UGrid); | ||||
|   SU3::HotConfiguration(RNG4, Umu); | ||||
|   SU<Nc>::HotConfiguration(RNG4, Umu); | ||||
|  | ||||
|   // Initialize RHMC fermion operators | ||||
|   GparityDomainWallFermionR::ImplParams params; | ||||
|   | ||||
| @@ -59,7 +59,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   LatticeGaugeField U(UGrid); | ||||
|  | ||||
|   SU3::HotConfiguration(RNG4,U); | ||||
|   SU<Nc>::HotConfiguration(RNG4,U); | ||||
|    | ||||
|   //////////////////////////////////// | ||||
|   // Unmodified matrix element | ||||
| @@ -93,7 +93,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|  | ||||
|     SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg | ||||
|     SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg | ||||
|  | ||||
|     PokeIndex<LorentzIndex>(mom,mommu,mu); | ||||
|  | ||||
|   | ||||
| @@ -60,7 +60,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   LatticeGaugeField U(UGrid); | ||||
|  | ||||
|   SU3::HotConfiguration(RNG4,U); | ||||
|   SU<Nc>::HotConfiguration(RNG4,U); | ||||
|    | ||||
|   //////////////////////////////////// | ||||
|   // Unmodified matrix element | ||||
| @@ -94,7 +94,7 @@ int main (int argc, char ** argv) | ||||
|  | ||||
|   for(int mu=0;mu<Nd;mu++){ | ||||
|  | ||||
|     SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg | ||||
|     SU<Nc>::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg | ||||
|  | ||||
|     PokeIndex<LorentzIndex>(mom,mommu,mu); | ||||
|  | ||||
|   | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user