mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-26 09:39:34 +00:00 
			
		
		
		
	Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv
This commit is contained in:
		| @@ -168,6 +168,7 @@ public: | |||||||
|   template<class vobj> |   template<class vobj> | ||||||
|   void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){ |   void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){ | ||||||
| #ifndef HAVE_FFTW | #ifndef HAVE_FFTW | ||||||
|  |     std::cerr << "FFTW is not compiled but is called"<<std::endl; | ||||||
|     assert(0); |     assert(0); | ||||||
| #else | #else | ||||||
|     conformable(result.Grid(),vgrid); |     conformable(result.Grid(),vgrid); | ||||||
| @@ -190,7 +191,8 @@ public: | |||||||
|        |        | ||||||
|     Lattice<sobj> pgbuf(&pencil_g); |     Lattice<sobj> pgbuf(&pencil_g); | ||||||
|     autoView(pgbuf_v , pgbuf, CpuWrite); |     autoView(pgbuf_v , pgbuf, CpuWrite); | ||||||
|  |     std::cout << "CPU view" << std::endl; | ||||||
|  |      | ||||||
|     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; |     typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar; | ||||||
|     typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan; |     typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan; | ||||||
|        |        | ||||||
| @@ -213,6 +215,7 @@ public: | |||||||
|     else if ( sign == forward ) div = 1.0; |     else if ( sign == forward ) div = 1.0; | ||||||
|     else assert(0); |     else assert(0); | ||||||
|        |        | ||||||
|  |     std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl; | ||||||
|     FFTW_plan p; |     FFTW_plan p; | ||||||
|     { |     { | ||||||
|       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; |       FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; | ||||||
| @@ -226,6 +229,7 @@ public: | |||||||
|     } |     } | ||||||
|        |        | ||||||
|     // Barrel shift and collect global pencil |     // Barrel shift and collect global pencil | ||||||
|  |     std::cout << GridLogPerformance<<"Making pencil" << std::endl; | ||||||
|     Coordinate lcoor(Nd), gcoor(Nd); |     Coordinate lcoor(Nd), gcoor(Nd); | ||||||
|     result = source; |     result = source; | ||||||
|     int pc = processor_coor[dim]; |     int pc = processor_coor[dim]; | ||||||
| @@ -247,6 +251,7 @@ public: | |||||||
|       } |       } | ||||||
|     } |     } | ||||||
|        |        | ||||||
|  |     std::cout <<GridLogPerformance<< "Looping orthog" << std::endl; | ||||||
|     // Loop over orthog coords |     // Loop over orthog coords | ||||||
|     int NN=pencil_g.lSites(); |     int NN=pencil_g.lSites(); | ||||||
|     GridStopWatch timer; |     GridStopWatch timer; | ||||||
| @@ -269,6 +274,7 @@ public: | |||||||
|     usec += timer.useconds(); |     usec += timer.useconds(); | ||||||
|     flops+= flops_call*NN; |     flops+= flops_call*NN; | ||||||
|        |        | ||||||
|  |     std::cout <<GridLogPerformance<< "Writing back results " << std::endl; | ||||||
|     // writing out result |     // writing out result | ||||||
|     { |     { | ||||||
|       autoView(pgbuf_v,pgbuf,CpuRead); |       autoView(pgbuf_v,pgbuf,CpuRead); | ||||||
| @@ -285,6 +291,7 @@ public: | |||||||
|     } |     } | ||||||
|     result = result*div; |     result = result*div; | ||||||
|        |        | ||||||
|  |     std::cout <<GridLogPerformance<< "Destroying plan " << std::endl; | ||||||
|     // destroying plan |     // destroying plan | ||||||
|     FFTW<scalar>::fftw_destroy_plan(p); |     FFTW<scalar>::fftw_destroy_plan(p); | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid); | |||||||
|   typedef cublasHandle_t gridblasHandle_t; |   typedef cublasHandle_t gridblasHandle_t; | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|   typedef cl::sycl::queue *gridblasHandle_t; |   typedef sycl::queue *gridblasHandle_t; | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_ONE_MKL | #ifdef GRID_ONE_MKL | ||||||
|   typedef cl::sycl::queue *gridblasHandle_t; |   typedef sycl::queue *gridblasHandle_t; | ||||||
| #endif | #endif | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) | #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) | ||||||
|   typedef int32_t gridblasHandle_t; |   typedef int32_t gridblasHandle_t; | ||||||
| @@ -89,9 +89,9 @@ public: | |||||||
|       gridblasHandle = theGridAccelerator; |       gridblasHandle = theGridAccelerator; | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_ONE_MKL | #ifdef GRID_ONE_MKL | ||||||
|       cl::sycl::gpu_selector selector; |       sycl::gpu_selector selector; | ||||||
|       cl::sycl::device selectedDevice { selector }; |       sycl::device selectedDevice { selector }; | ||||||
|       cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()}; |       sycl::property_list q_prop{sycl::property::queue::in_order()}; | ||||||
|       gridblasHandle =new sycl::queue (selectedDevice,q_prop); |       gridblasHandle =new sycl::queue (selectedDevice,q_prop); | ||||||
| #endif | #endif | ||||||
|       gridblasInit=1; |       gridblasInit=1; | ||||||
|   | |||||||
| @@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid); | |||||||
|       //Compute double precision rsd and also new RHS vector. |       //Compute double precision rsd and also new RHS vector. | ||||||
|       Linop_d.HermOp(sol_d, tmp_d); |       Linop_d.HermOp(sol_d, tmp_d); | ||||||
|       RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector |       RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector | ||||||
|        |       std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl; | ||||||
|       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; |       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl; | ||||||
|  |  | ||||||
|       if(norm < OuterLoopNormMult * stop){ |       if(norm < OuterLoopNormMult * stop){ | ||||||
| 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; | 	std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl; | ||||||
| 	break; | 	break; | ||||||
|       } |       } | ||||||
|       while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? |       while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ?? | ||||||
|  |  | ||||||
|       PrecChangeTimer.Start(); |       PrecChangeTimer.Start(); | ||||||
|       precisionChange(src_f, src_d, pc_wk_dp_to_sp); |       precisionChange(src_f, src_d, pc_wk_dp_to_sp); | ||||||
|   | |||||||
| @@ -102,11 +102,11 @@ public: | |||||||
|     assert(mass.size()==nshift); |     assert(mass.size()==nshift); | ||||||
|     assert(mresidual.size()==nshift); |     assert(mresidual.size()==nshift); | ||||||
|    |    | ||||||
|     // dynamic sized arrays on stack; 2d is a pain with vector |     // remove dynamic sized arrays on stack; 2d is a pain with vector | ||||||
|     RealD  bs[nshift]; |     std::vector<RealD>  bs(nshift); | ||||||
|     RealD  rsq[nshift]; |     std::vector<RealD>  rsq(nshift); | ||||||
|     RealD  z[nshift][2]; |     std::vector<std::array<RealD,2> >  z(nshift); | ||||||
|     int     converged[nshift]; |     std::vector<int>     converged(nshift); | ||||||
|    |    | ||||||
|     const int       primary =0; |     const int       primary =0; | ||||||
|    |    | ||||||
|   | |||||||
| @@ -123,11 +123,11 @@ public: | |||||||
|     assert(mresidual.size()==nshift); |     assert(mresidual.size()==nshift); | ||||||
|    |    | ||||||
|     // dynamic sized arrays on stack; 2d is a pain with vector |     // dynamic sized arrays on stack; 2d is a pain with vector | ||||||
|     RealD  bs[nshift]; |     std::vector<RealD>  bs(nshift); | ||||||
|     RealD  rsq[nshift]; |     std::vector<RealD>  rsq(nshift); | ||||||
|     RealD  rsqf[nshift]; |     std::vector<RealD>  rsqf(nshift); | ||||||
|     RealD  z[nshift][2]; |     std::vector<std::array<RealD,2> >  z(nshift); | ||||||
|     int     converged[nshift]; |     std::vector<int>     converged(nshift); | ||||||
|    |    | ||||||
|     const int       primary =0; |     const int       primary =0; | ||||||
|    |    | ||||||
|   | |||||||
| @@ -156,11 +156,11 @@ public: | |||||||
|     assert(mresidual.size()==nshift); |     assert(mresidual.size()==nshift); | ||||||
|    |    | ||||||
|     // dynamic sized arrays on stack; 2d is a pain with vector |     // dynamic sized arrays on stack; 2d is a pain with vector | ||||||
|     RealD  bs[nshift]; |     std::vector<RealD>  bs(nshift); | ||||||
|     RealD  rsq[nshift]; |     std::vector<RealD>  rsq(nshift); | ||||||
|     RealD  rsqf[nshift]; |     std::vector<RealD>  rsqf(nshift); | ||||||
|     RealD  z[nshift][2]; |     std::vector<std::array<RealD,2> >  z(nshift); | ||||||
|     int     converged[nshift]; |     std::vector<int>     converged(nshift); | ||||||
|    |    | ||||||
|     const int       primary =0; |     const int       primary =0; | ||||||
|    |    | ||||||
|   | |||||||
| @@ -74,7 +74,7 @@ public: | |||||||
|  |  | ||||||
|   void operator() (const Field &src, Field &psi){ |   void operator() (const Field &src, Field &psi){ | ||||||
|  |  | ||||||
|     psi=Zero(); |     //    psi=Zero(); | ||||||
|     RealD cp, ssq,rsq; |     RealD cp, ssq,rsq; | ||||||
|     ssq=norm2(src); |     ssq=norm2(src); | ||||||
|     rsq=Tolerance*Tolerance*ssq; |     rsq=Tolerance*Tolerance*ssq; | ||||||
|   | |||||||
| @@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
| #pragma once | #pragma once | ||||||
|  |  | ||||||
|  | #include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h> | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| inline RealD AggregatePowerLaw(RealD x) | inline RealD AggregatePowerLaw(RealD x) | ||||||
| @@ -124,6 +126,53 @@ public: | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   virtual void CreateSubspaceGCR(GridParallelRNG  &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis) | ||||||
|  |   { | ||||||
|  |     RealD scale; | ||||||
|  |  | ||||||
|  |     TrivialPrecon<FineField> simple_fine; | ||||||
|  |     PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12); | ||||||
|  |     FineField noise(FineGrid); | ||||||
|  |     FineField src(FineGrid); | ||||||
|  |     FineField guess(FineGrid); | ||||||
|  |     FineField Mn(FineGrid); | ||||||
|  |  | ||||||
|  |     for(int b=0;b<nn;b++){ | ||||||
|  |        | ||||||
|  |       subspace[b] = Zero(); | ||||||
|  |       gaussian(RNG,noise); | ||||||
|  |       scale = std::pow(norm2(noise),-0.5);  | ||||||
|  |       noise=noise*scale; | ||||||
|  |        | ||||||
|  |       DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise   ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl; | ||||||
|  |  | ||||||
|  |       for(int i=0;i<3;i++){ | ||||||
|  | 	//  void operator() (const Field &src, Field &psi){ | ||||||
|  | #if 1 | ||||||
|  | 	std::cout << GridLogMessage << " inverting on noise "<<std::endl; | ||||||
|  | 	src = noise; | ||||||
|  | 	guess=Zero(); | ||||||
|  | 	GCR(src,guess); | ||||||
|  | 	subspace[b] = guess; | ||||||
|  | #else | ||||||
|  | 	std::cout << GridLogMessage << " inverting on zero "<<std::endl; | ||||||
|  | 	src=Zero(); | ||||||
|  | 	guess = noise; | ||||||
|  | 	GCR(src,guess); | ||||||
|  | 	subspace[b] = guess; | ||||||
|  | #endif | ||||||
|  | 	noise = subspace[b]; | ||||||
|  | 	scale = std::pow(norm2(noise),-0.5);  | ||||||
|  | 	noise=noise*scale; | ||||||
|  |  | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl; | ||||||
|  |       subspace[b]   = noise; | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) |   // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) | ||||||
|   // and this is the best I found |   // and this is the best I found | ||||||
| @@ -160,14 +209,21 @@ public: | |||||||
|  |  | ||||||
|     int b =0; |     int b =0; | ||||||
|     { |     { | ||||||
|  |       ComplexD ip; | ||||||
|       // Filter |       // Filter | ||||||
|       Chebyshev<FineField> Cheb(lo,hi,orderfilter); |       Chebyshev<FineField> Cheb(lo,hi,orderfilter); | ||||||
|       Cheb(hermop,noise,Mn); |       Cheb(hermop,noise,Mn); | ||||||
|       // normalise |       // normalise | ||||||
|       scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale; |       scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale; | ||||||
|       subspace[b]   = Mn; |       subspace[b]   = Mn; | ||||||
|       hermop.Op(Mn,tmp);  |  | ||||||
|       std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; |       hermop.Op(Mn,tmp); | ||||||
|  |       ip= innerProduct(Mn,tmp);  | ||||||
|  |       std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl; | ||||||
|  |  | ||||||
|  |       hermop.AdjOp(Mn,tmp);  | ||||||
|  |       ip = innerProduct(Mn,tmp);  | ||||||
|  |       std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl; | ||||||
|       b++; |       b++; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -213,8 +269,18 @@ public: | |||||||
| 	  Mn=*Tnp; | 	  Mn=*Tnp; | ||||||
| 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale; | 	  scale = std::pow(norm2(Mn),-0.5);         Mn=Mn*scale; | ||||||
| 	  subspace[b] = Mn; | 	  subspace[b] = Mn; | ||||||
| 	  hermop.Op(Mn,tmp);  |  | ||||||
| 	  std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl; |  | ||||||
|  | 	  ComplexD ip; | ||||||
|  |  | ||||||
|  | 	  hermop.Op(Mn,tmp); | ||||||
|  | 	  ip= innerProduct(Mn,tmp);  | ||||||
|  | 	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl; | ||||||
|  |  | ||||||
|  | 	  hermop.AdjOp(Mn,tmp);  | ||||||
|  | 	  ip = innerProduct(Mn,tmp);  | ||||||
|  | 	  std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl; | ||||||
|  | 	   | ||||||
| 	  b++; | 	  b++; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|   | |||||||
| @@ -99,7 +99,7 @@ public: | |||||||
|   CoarseMatrix AselfInvEven; |   CoarseMatrix AselfInvEven; | ||||||
|   CoarseMatrix AselfInvOdd; |   CoarseMatrix AselfInvOdd; | ||||||
|  |  | ||||||
|   Vector<RealD> dag_factor; |   deviceVector<RealD> dag_factor; | ||||||
|  |  | ||||||
|   /////////////////////// |   /////////////////////// | ||||||
|   // Interface |   // Interface | ||||||
| @@ -124,9 +124,13 @@ public: | |||||||
|     int npoint = geom.npoint; |     int npoint = geom.npoint; | ||||||
|     typedef LatticeView<Cobj> Aview; |     typedef LatticeView<Cobj> Aview; | ||||||
|        |        | ||||||
|     Vector<Aview> AcceleratorViewContainer; |     deviceVector<Aview> AcceleratorViewContainer(geom.npoint); | ||||||
|  |     hostVector<Aview>   hAcceleratorViewContainer(geom.npoint); | ||||||
|    |    | ||||||
|     for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); |     for(int p=0;p<geom.npoint;p++) { | ||||||
|  |       hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead); | ||||||
|  |       acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]); | ||||||
|  |     } | ||||||
|     Aview *Aview_p = & AcceleratorViewContainer[0]; |     Aview *Aview_p = & AcceleratorViewContainer[0]; | ||||||
|  |  | ||||||
|     const int Nsimd = CComplex::Nsimd(); |     const int Nsimd = CComplex::Nsimd(); | ||||||
| @@ -161,7 +165,7 @@ public: | |||||||
|       coalescedWrite(out_v[ss](b),res); |       coalescedWrite(out_v[ss](b),res); | ||||||
|       }); |       }); | ||||||
|  |  | ||||||
|     for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); |     for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   void Mdag (const CoarseVector &in, CoarseVector &out) |   void Mdag (const CoarseVector &in, CoarseVector &out) | ||||||
| @@ -190,9 +194,14 @@ public: | |||||||
|     int npoint = geom.npoint; |     int npoint = geom.npoint; | ||||||
|     typedef LatticeView<Cobj> Aview; |     typedef LatticeView<Cobj> Aview; | ||||||
|  |  | ||||||
|     Vector<Aview> AcceleratorViewContainer; |  | ||||||
|  |  | ||||||
|     for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); |     deviceVector<Aview> AcceleratorViewContainer(geom.npoint); | ||||||
|  |     hostVector<Aview>   hAcceleratorViewContainer(geom.npoint); | ||||||
|  |    | ||||||
|  |     for(int p=0;p<geom.npoint;p++) { | ||||||
|  |       hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead); | ||||||
|  |       acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]); | ||||||
|  |     } | ||||||
|     Aview *Aview_p = & AcceleratorViewContainer[0]; |     Aview *Aview_p = & AcceleratorViewContainer[0]; | ||||||
|  |  | ||||||
|     const int Nsimd = CComplex::Nsimd(); |     const int Nsimd = CComplex::Nsimd(); | ||||||
| @@ -201,10 +210,10 @@ public: | |||||||
|  |  | ||||||
|     int osites=Grid()->oSites(); |     int osites=Grid()->oSites(); | ||||||
|  |  | ||||||
|     Vector<int> points(geom.npoint, 0); |     deviceVector<int> points(geom.npoint); | ||||||
|     for(int p=0; p<geom.npoint; p++) |     for(int p=0; p<geom.npoint; p++) {  | ||||||
|       points[p] = geom.points_dagger[p]; |       acceleratorPut(points[p],geom.points_dagger[p]); | ||||||
|  |     } | ||||||
|     auto points_p = &points[0]; |     auto points_p = &points[0]; | ||||||
|  |  | ||||||
|     RealD* dag_factor_p = &dag_factor[0]; |     RealD* dag_factor_p = &dag_factor[0]; | ||||||
| @@ -236,7 +245,7 @@ public: | |||||||
|       coalescedWrite(out_v[ss](b),res); |       coalescedWrite(out_v[ss](b),res); | ||||||
|       }); |       }); | ||||||
|  |  | ||||||
|     for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); |     for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void MdirComms(const CoarseVector &in) |   void MdirComms(const CoarseVector &in) | ||||||
| @@ -251,8 +260,14 @@ public: | |||||||
|     out.Checkerboard() = in.Checkerboard(); |     out.Checkerboard() = in.Checkerboard(); | ||||||
|  |  | ||||||
|     typedef LatticeView<Cobj> Aview; |     typedef LatticeView<Cobj> Aview; | ||||||
|     Vector<Aview> AcceleratorViewContainer; |  | ||||||
|     for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead)); |     deviceVector<Aview> AcceleratorViewContainer(geom.npoint); | ||||||
|  |     hostVector<Aview>   hAcceleratorViewContainer(geom.npoint); | ||||||
|  |    | ||||||
|  |     for(int p=0;p<geom.npoint;p++) { | ||||||
|  |       hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead); | ||||||
|  |       acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]); | ||||||
|  |     } | ||||||
|     Aview *Aview_p = & AcceleratorViewContainer[0]; |     Aview *Aview_p = & AcceleratorViewContainer[0]; | ||||||
|  |  | ||||||
|     autoView( out_v , out, AcceleratorWrite); |     autoView( out_v , out, AcceleratorWrite); | ||||||
| @@ -285,7 +300,7 @@ public: | |||||||
|       } |       } | ||||||
|       coalescedWrite(out_v[ss](b),res); |       coalescedWrite(out_v[ss](b),res); | ||||||
|     }); |     }); | ||||||
|     for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); |     for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose(); | ||||||
|   } |   } | ||||||
|   void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) |   void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out) | ||||||
|   { |   { | ||||||
| @@ -469,14 +484,20 @@ public: | |||||||
|  |  | ||||||
|     // determine in what order we need the points |     // determine in what order we need the points | ||||||
|     int npoint = geom.npoint-1; |     int npoint = geom.npoint-1; | ||||||
|     Vector<int> points(npoint, 0); |     deviceVector<int> points(npoint); | ||||||
|     for(int p=0; p<npoint; p++) |     for(int p=0; p<npoint; p++) { | ||||||
|       points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p; |       int val = (dag && !hermitian) ? geom.points_dagger[p] : p; | ||||||
|  |       acceleratorPut(points[p], val); | ||||||
|  |     } | ||||||
|     auto points_p = &points[0]; |     auto points_p = &points[0]; | ||||||
|  |  | ||||||
|     Vector<Aview> AcceleratorViewContainer; |     deviceVector<Aview> AcceleratorViewContainer(geom.npoint); | ||||||
|     for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead)); |     hostVector<Aview>   hAcceleratorViewContainer(geom.npoint); | ||||||
|  |    | ||||||
|  |     for(int p=0;p<geom.npoint;p++) { | ||||||
|  |       hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead); | ||||||
|  |       acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]); | ||||||
|  |     } | ||||||
|     Aview *Aview_p = & AcceleratorViewContainer[0]; |     Aview *Aview_p = & AcceleratorViewContainer[0]; | ||||||
|  |  | ||||||
|     const int Nsimd = CComplex::Nsimd(); |     const int Nsimd = CComplex::Nsimd(); | ||||||
| @@ -539,7 +560,7 @@ public: | |||||||
|       }); |       }); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose(); |     for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose(); | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	: |   CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) 	: | ||||||
| @@ -590,11 +611,13 @@ public: | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     // GPU readable prefactor |     // GPU readable prefactor | ||||||
|  |     std::vector<RealD> h_dag_factor(nbasis*nbasis); | ||||||
|     thread_for(i, nbasis*nbasis, { |     thread_for(i, nbasis*nbasis, { | ||||||
|       int j = i/nbasis; |       int j = i/nbasis; | ||||||
|       int k = i%nbasis; |       int k = i%nbasis; | ||||||
|       dag_factor[i] = dag_factor_eigen(j, k); |       h_dag_factor[i] = dag_factor_eigen(j, k); | ||||||
|     }); |     }); | ||||||
|  |     acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD)); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, |   void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, | ||||||
|   | |||||||
| @@ -174,19 +174,10 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d | |||||||
| //////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||||
| // Template typedefs | // Template typedefs | ||||||
| //////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||||
| #ifdef ACCELERATOR_CSHIFT | template<class T> using hostVector          = std::vector<T,alignedAllocator<T> >;           // Needs autoview | ||||||
| // Cshift on device | template<class T> using Vector              = std::vector<T,uvmAllocator<T> >;               // Really want to deprecate | ||||||
| template<class T> using cshiftAllocator = devAllocator<T>; | template<class T> using uvmVector           = std::vector<T,uvmAllocator<T> >;               // auto migrating page | ||||||
| #else | template<class T> using deviceVector        = std::vector<T,devAllocator<T> >;               // device vector | ||||||
| // Cshift on host |  | ||||||
| template<class T> using cshiftAllocator = std::allocator<T>; |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;            |  | ||||||
| template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;            |  | ||||||
| template<class T> using commVector    = std::vector<T,devAllocator<T> >; |  | ||||||
| template<class T> using deviceVector  = std::vector<T,devAllocator<T> >; |  | ||||||
| template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >; |  | ||||||
|  |  | ||||||
| /* | /* | ||||||
| template<class T> class vecView | template<class T> class vecView | ||||||
| @@ -197,8 +188,9 @@ template<class T> class vecView | |||||||
|   ViewMode mode; |   ViewMode mode; | ||||||
|   void * cpu_ptr; |   void * cpu_ptr; | ||||||
|  public: |  public: | ||||||
|  |   // Rvalue accessor | ||||||
|   accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; |   accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; | ||||||
|   vecView(std::vector<T> &refer_to_me,ViewMode _mode) |   vecView(Vector<T> &refer_to_me,ViewMode _mode) | ||||||
|   { |   { | ||||||
|     cpu_ptr = &refer_to_me[0]; |     cpu_ptr = &refer_to_me[0]; | ||||||
|     size = refer_to_me.size(); |     size = refer_to_me.size(); | ||||||
| @@ -214,22 +206,12 @@ template<class T> class vecView | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode) | template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode) | ||||||
| { | { | ||||||
|   vecView<T> ret(vec,_mode); // does the open |   vecView<T> ret(vec,_mode); // does the open | ||||||
|   return ret;                // must be closed |   return ret;                // must be closed | ||||||
| } | } | ||||||
|  |  | ||||||
| // Little autoscope assister |  | ||||||
| template<class View>  |  | ||||||
| class VectorViewCloser |  | ||||||
| { |  | ||||||
|   View v;  // Take a copy of view and call view close when I go out of scope automatically |  | ||||||
|  public: |  | ||||||
|   VectorViewCloser(View &_v) : v(_v) {}; |  | ||||||
|   ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose();  MemoryManager::NotifyDeletion(ptr);} |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| #define autoVecView(v_v,v,mode)					\ | #define autoVecView(v_v,v,mode)					\ | ||||||
|   auto v_v = VectorView(v,mode);				\ |   auto v_v = VectorView(v,mode);				\ | ||||||
|   ViewCloser<decltype(v_v)> _autoView##v_v(v_v); |   ViewCloser<decltype(v_v)> _autoView##v_v(v_v); | ||||||
|   | |||||||
| @@ -1,16 +1,15 @@ | |||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
| #ifndef GRID_UVM | #ifndef GRID_UVM | ||||||
|  |  | ||||||
| #warning "Using explicit device memory copies" |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| #define MAXLINE 512 | #define MAXLINE 512 | ||||||
| static char print_buffer [ MAXLINE ]; | static char print_buffer [ MAXLINE ]; | ||||||
|  |  | ||||||
| #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer; | #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl; | ||||||
| #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer; | #define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug  << print_buffer << std::endl; | ||||||
| //#define dprintf(...)  | //#define dprintf(...)  | ||||||
|  | //#define mprintf(...)  | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////// | ||||||
| // For caching copies of data on device | // For caching copies of data on device | ||||||
| @@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) | |||||||
|   /////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////// | ||||||
|   assert(AccCache.state!=Empty); |   assert(AccCache.state!=Empty); | ||||||
|    |    | ||||||
|   dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);  |   dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);  | ||||||
|   assert(AccCache.accLock==0); |   assert(AccCache.accLock==0); | ||||||
|   assert(AccCache.cpuLock==0); |   assert(AccCache.cpuLock==0); | ||||||
|   assert(AccCache.CpuPtr!=(uint64_t)NULL); |   assert(AccCache.CpuPtr!=(uint64_t)NULL); | ||||||
| @@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) | |||||||
|     DeviceBytes   -=AccCache.bytes; |     DeviceBytes   -=AccCache.bytes; | ||||||
|     LRUremove(AccCache); |     LRUremove(AccCache); | ||||||
|     AccCache.AccPtr=(uint64_t) NULL; |     AccCache.AccPtr=(uint64_t) NULL; | ||||||
|     dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);   |     dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);   | ||||||
|   } |   } | ||||||
|   uint64_t CpuPtr = AccCache.CpuPtr; |   uint64_t CpuPtr = AccCache.CpuPtr; | ||||||
|   EntryErase(CpuPtr); |   EntryErase(CpuPtr); | ||||||
| @@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) | |||||||
|   /////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////// | ||||||
|   assert(AccCache.state!=Empty); |   assert(AccCache.state!=Empty); | ||||||
|    |    | ||||||
|   mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n", |   mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld", | ||||||
| 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, | 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr, | ||||||
| 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);  | 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);  | ||||||
|   if (AccCache.accLock!=0) return; |   if (AccCache.accLock!=0) return; | ||||||
| @@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) | |||||||
|     AccCache.AccPtr=(uint64_t)NULL; |     AccCache.AccPtr=(uint64_t)NULL; | ||||||
|     AccCache.state=CpuDirty; // CPU primary now |     AccCache.state=CpuDirty; // CPU primary now | ||||||
|     DeviceBytes   -=AccCache.bytes; |     DeviceBytes   -=AccCache.bytes; | ||||||
|     dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);   |     dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);   | ||||||
|   } |   } | ||||||
|   //  uint64_t CpuPtr = AccCache.CpuPtr; |   //  uint64_t CpuPtr = AccCache.CpuPtr; | ||||||
|   DeviceEvictions++; |   DeviceEvictions++; | ||||||
| @@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) | |||||||
|   assert(AccCache.AccPtr!=(uint64_t)NULL); |   assert(AccCache.AccPtr!=(uint64_t)NULL); | ||||||
|   assert(AccCache.CpuPtr!=(uint64_t)NULL); |   assert(AccCache.CpuPtr!=(uint64_t)NULL); | ||||||
|   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); |   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); | ||||||
|   mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); |   mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); | ||||||
|   DeviceToHostBytes+=AccCache.bytes; |   DeviceToHostBytes+=AccCache.bytes; | ||||||
|   DeviceToHostXfer++; |   DeviceToHostXfer++; | ||||||
|   AccCache.state=Consistent; |   AccCache.state=Consistent; | ||||||
| @@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) | |||||||
|     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); |     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); | ||||||
|     DeviceBytes+=AccCache.bytes; |     DeviceBytes+=AccCache.bytes; | ||||||
|   } |   } | ||||||
|   mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); |   mprintf("MemoryManager: acceleratorCopyToDevice   Clone size %ld AccPtr %lx <- CpuPtr %lx", | ||||||
|  | 	  (uint64_t)AccCache.bytes, | ||||||
|  | 	  (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); | ||||||
|   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); |   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); | ||||||
|   HostToDeviceBytes+=AccCache.bytes; |   HostToDeviceBytes+=AccCache.bytes; | ||||||
|   HostToDeviceXfer++; |   HostToDeviceXfer++; | ||||||
| @@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache) | |||||||
| void MemoryManager::ViewClose(void* Ptr,ViewMode mode) | void MemoryManager::ViewClose(void* Ptr,ViewMode mode) | ||||||
| { | { | ||||||
|   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ |   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ | ||||||
|     dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr); |     dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr); | ||||||
|     AcceleratorViewClose((uint64_t)Ptr); |     AcceleratorViewClose((uint64_t)Ptr); | ||||||
|   } else if( (mode==CpuRead)||(mode==CpuWrite)){ |   } else if( (mode==CpuRead)||(mode==CpuWrite)){ | ||||||
|     CpuViewClose((uint64_t)Ptr); |     CpuViewClose((uint64_t)Ptr); | ||||||
| @@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis | |||||||
| { | { | ||||||
|   uint64_t CpuPtr = (uint64_t)_CpuPtr; |   uint64_t CpuPtr = (uint64_t)_CpuPtr; | ||||||
|   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ |   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){ | ||||||
|     dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr); |     dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr); | ||||||
|     return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); |     return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint); | ||||||
|   } else if( (mode==CpuRead)||(mode==CpuWrite)){ |   } else if( (mode==CpuRead)||(mode==CpuWrite)){ | ||||||
|     return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); |     return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint); | ||||||
| @@ -265,7 +266,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod | |||||||
|   assert(AccCache.cpuLock==0);  // Programming error |   assert(AccCache.cpuLock==0);  // Programming error | ||||||
|  |  | ||||||
|   if(AccCache.state!=Empty) { |   if(AccCache.state!=Empty) { | ||||||
|     dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n", |     dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld", | ||||||
| 		    (uint64_t)AccCache.CpuPtr, | 		    (uint64_t)AccCache.CpuPtr, | ||||||
| 		    (uint64_t)CpuPtr, | 		    (uint64_t)CpuPtr, | ||||||
| 		    (uint64_t)AccCache.bytes, | 		    (uint64_t)AccCache.bytes, | ||||||
| @@ -305,7 +306,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod | |||||||
|       AccCache.state  = Consistent; // Empty + AccRead => Consistent |       AccCache.state  = Consistent; // Empty + AccRead => Consistent | ||||||
|     } |     } | ||||||
|     AccCache.accLock= 1; |     AccCache.accLock= 1; | ||||||
|     dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock); |     dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock); | ||||||
|   } else if(AccCache.state==CpuDirty ){ |   } else if(AccCache.state==CpuDirty ){ | ||||||
|     if(mode==AcceleratorWriteDiscard) { |     if(mode==AcceleratorWriteDiscard) { | ||||||
|       CpuDiscard(AccCache); |       CpuDiscard(AccCache); | ||||||
| @@ -318,21 +319,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod | |||||||
|       AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent |       AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent | ||||||
|     } |     } | ||||||
|     AccCache.accLock++; |     AccCache.accLock++; | ||||||
|     dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock); |     dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock); | ||||||
|   } else if(AccCache.state==Consistent) { |   } else if(AccCache.state==Consistent) { | ||||||
|     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) |     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) | ||||||
|       AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty |       AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty | ||||||
|     else |     else | ||||||
|       AccCache.state  = Consistent; // Consistent + AccRead => Consistent |       AccCache.state  = Consistent; // Consistent + AccRead => Consistent | ||||||
|     AccCache.accLock++; |     AccCache.accLock++; | ||||||
|     dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock); |     dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock); | ||||||
|   } else if(AccCache.state==AccDirty) { |   } else if(AccCache.state==AccDirty) { | ||||||
|     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) |     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) | ||||||
|       AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty |       AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty | ||||||
|     else |     else | ||||||
|       AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty |       AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty | ||||||
|     AccCache.accLock++; |     AccCache.accLock++; | ||||||
|     dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock); |     dprintf("AccDirty entry ++accLock= %d",AccCache.accLock); | ||||||
|   } else { |   } else { | ||||||
|     assert(0); |     assert(0); | ||||||
|   } |   } | ||||||
| @@ -341,7 +342,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod | |||||||
|   // If view is opened on device must remove from LRU |   // If view is opened on device must remove from LRU | ||||||
|   if(AccCache.LRU_valid==1){ |   if(AccCache.LRU_valid==1){ | ||||||
|     // must possibly remove from LRU as now locked on GPU |     // must possibly remove from LRU as now locked on GPU | ||||||
|     dprintf("AccCache entry removed from LRU \n"); |     dprintf("AccCache entry removed from LRU "); | ||||||
|     LRUremove(AccCache); |     LRUremove(AccCache); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -364,10 +365,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr) | |||||||
|   AccCache.accLock--; |   AccCache.accLock--; | ||||||
|   // Move to LRU queue if not locked and close on device |   // Move to LRU queue if not locked and close on device | ||||||
|   if(AccCache.accLock==0) { |   if(AccCache.accLock==0) { | ||||||
|     dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); |     dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); | ||||||
|     LRUinsert(AccCache); |     LRUinsert(AccCache); | ||||||
|   } else { |   } else { | ||||||
|     dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); |     dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock); | ||||||
|   } |   } | ||||||
| } | } | ||||||
| void MemoryManager::CpuViewClose(uint64_t CpuPtr) | void MemoryManager::CpuViewClose(uint64_t CpuPtr) | ||||||
|   | |||||||
| @@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES) | |||||||
|   uint64_t virt_pfn = (uint64_t)Buf / page_size; |   uint64_t virt_pfn = (uint64_t)Buf / page_size; | ||||||
|   off_t offset = sizeof(uint64_t) * virt_pfn; |   off_t offset = sizeof(uint64_t) * virt_pfn; | ||||||
|   uint64_t npages = (BYTES + page_size-1) / page_size; |   uint64_t npages = (BYTES + page_size-1) / page_size; | ||||||
|   uint64_t pagedata[npages]; |   std::vector<uint64_t> pagedata(npages); | ||||||
|   uint64_t ret = lseek(fd, offset, SEEK_SET); |   uint64_t ret = lseek(fd, offset, SEEK_SET); | ||||||
|   assert(ret == offset); |   assert(ret == offset); | ||||||
|   ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); |   ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages); | ||||||
|   assert(ret == sizeof(uint64_t) * npages); |   assert(ret == sizeof(uint64_t) * npages); | ||||||
|   int nhugepages = npages / 512; |   int nhugepages = npages / 512; | ||||||
|   int n4ktotal, nnothuge; |   int n4ktotal, nnothuge; | ||||||
|   | |||||||
| @@ -57,18 +57,29 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return | |||||||
| // very VERY rarely (Log, serial RNG) we need world without a grid | // very VERY rarely (Log, serial RNG) we need world without a grid | ||||||
| //////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  | #ifdef USE_GRID_REDUCTION | ||||||
|  | void CartesianCommunicator::GlobalSum(ComplexF &c) | ||||||
|  | { | ||||||
|  |   GlobalSumP2P(c); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(ComplexD &c) | ||||||
|  | { | ||||||
|  |   GlobalSumP2P(c); | ||||||
|  | } | ||||||
|  | #else | ||||||
| void CartesianCommunicator::GlobalSum(ComplexF &c) | void CartesianCommunicator::GlobalSum(ComplexF &c) | ||||||
| { | { | ||||||
|   GlobalSumVector((float *)&c,2); |   GlobalSumVector((float *)&c,2); | ||||||
| } | } | ||||||
| void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) |  | ||||||
| { |  | ||||||
|   GlobalSumVector((float *)c,2*N); |  | ||||||
| } |  | ||||||
| void CartesianCommunicator::GlobalSum(ComplexD &c) | void CartesianCommunicator::GlobalSum(ComplexD &c) | ||||||
| { | { | ||||||
|   GlobalSumVector((double *)&c,2); |   GlobalSumVector((double *)&c,2); | ||||||
| } | } | ||||||
|  | #endif | ||||||
|  | void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((float *)c,2*N); | ||||||
|  | } | ||||||
| void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | ||||||
| { | { | ||||||
|   GlobalSumVector((double *)c,2*N); |   GlobalSumVector((double *)c,2*N); | ||||||
|   | |||||||
| @@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| /////////////////////////////////// | /////////////////////////////////// | ||||||
| #include <Grid/communicator/SharedMemory.h> | #include <Grid/communicator/SharedMemory.h> | ||||||
|  |  | ||||||
|  | #define NVLINK_GET | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| extern bool Stencil_force_mpi ; | extern bool Stencil_force_mpi ; | ||||||
| @@ -136,7 +138,7 @@ public: | |||||||
|     for(int d=0;d<_ndimension;d++){ |     for(int d=0;d<_ndimension;d++){ | ||||||
|       column.resize(_processors[d]); |       column.resize(_processors[d]); | ||||||
|       column[0] = accum; |       column[0] = accum; | ||||||
|       std::vector<CommsRequest_t> list; |       std::vector<MpiCommsRequest_t> list; | ||||||
|       for(int p=1;p<_processors[d];p++){ |       for(int p=1;p<_processors[d];p++){ | ||||||
| 	ShiftedRanks(d,p,source,dest); | 	ShiftedRanks(d,p,source,dest); | ||||||
| 	SendToRecvFromBegin(list, | 	SendToRecvFromBegin(list, | ||||||
| @@ -166,8 +168,8 @@ public: | |||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Face exchange, buffer swap in translational invariant way |   // Face exchange, buffer swap in translational invariant way | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   void CommsComplete(std::vector<CommsRequest_t> &list); |   void CommsComplete(std::vector<MpiCommsRequest_t> &list); | ||||||
|   void SendToRecvFromBegin(std::vector<CommsRequest_t> &list, |   void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list, | ||||||
| 			   void *xmit, | 			   void *xmit, | ||||||
| 			   int dest, | 			   int dest, | ||||||
| 			   void *recv, | 			   void *recv, | ||||||
| @@ -186,6 +188,17 @@ public: | |||||||
| 			       int recv_from_rank,int do_recv, | 			       int recv_from_rank,int do_recv, | ||||||
| 			       int bytes,int dir); | 			       int bytes,int dir); | ||||||
|  |  | ||||||
|  |   double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, | ||||||
|  | 				      void *xmit, | ||||||
|  | 				      int xmit_to_rank,int do_xmit, | ||||||
|  | 				      void *recv, | ||||||
|  | 				      int recv_from_rank,int do_recv, | ||||||
|  | 				      int xbytes,int rbytes,int dir); | ||||||
|  |  | ||||||
|  |   // Could do a PollHtoD and have a CommsMerge dependence | ||||||
|  |   void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list); | ||||||
|  |   void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list); | ||||||
|  |  | ||||||
|   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, |   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 				    void *xmit, | 				    void *xmit, | ||||||
| 				    int xmit_to_rank,int do_xmit, | 				    int xmit_to_rank,int do_xmit, | ||||||
|   | |||||||
| @@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  |  | ||||||
| Grid_MPI_Comm       CartesianCommunicator::communicator_world; | Grid_MPI_Comm       CartesianCommunicator::communicator_world; | ||||||
|  |  | ||||||
| //////////////////////////////////////////// | //////////////////////////////////////////// | ||||||
| @@ -257,6 +258,25 @@ CartesianCommunicator::~CartesianCommunicator() | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  | #ifdef USE_GRID_REDUCTION | ||||||
|  | void CartesianCommunicator::GlobalSum(float &f){ | ||||||
|  |   CartesianCommunicator::GlobalSumP2P(f); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(double &d) | ||||||
|  | { | ||||||
|  |   CartesianCommunicator::GlobalSumP2P(d); | ||||||
|  | } | ||||||
|  | #else | ||||||
|  | void CartesianCommunicator::GlobalSum(float &f){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(double &d) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | #endif | ||||||
| void CartesianCommunicator::GlobalSum(uint32_t &u){ | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| @@ -287,27 +307,18 @@ void CartesianCommunicator::GlobalMax(double &d) | |||||||
|   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); |   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
| void CartesianCommunicator::GlobalSum(float &f){ |  | ||||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); |  | ||||||
|   assert(ierr==0); |  | ||||||
| } |  | ||||||
| void CartesianCommunicator::GlobalSumVector(float *f,int N) | void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||||
| { | { | ||||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); |   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
| void CartesianCommunicator::GlobalSum(double &d) |  | ||||||
| { |  | ||||||
|   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); |  | ||||||
|   assert(ierr==0); |  | ||||||
| } |  | ||||||
| void CartesianCommunicator::GlobalSumVector(double *d,int N) | void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||||
| { | { | ||||||
|   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); |   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list, | ||||||
| 						void *xmit, | 						void *xmit, | ||||||
| 						int dest, | 						int dest, | ||||||
| 						void *recv, | 						void *recv, | ||||||
| @@ -332,7 +343,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | |||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
|   list.push_back(xrq); |   list.push_back(xrq); | ||||||
| } | } | ||||||
| void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list) | void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list) | ||||||
| { | { | ||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|  |  | ||||||
| @@ -351,9 +362,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, | |||||||
| 					   int from, | 					   int from, | ||||||
| 					   int bytes) | 					   int bytes) | ||||||
| { | { | ||||||
|   std::vector<CommsRequest_t> reqs(0); |   std::vector<MpiCommsRequest_t> reqs(0); | ||||||
|   unsigned long  xcrc = crc32(0L, Z_NULL, 0); |  | ||||||
|   unsigned long  rcrc = crc32(0L, Z_NULL, 0); |  | ||||||
|  |  | ||||||
|   int myrank = _processor; |   int myrank = _processor; | ||||||
|   int ierr; |   int ierr; | ||||||
| @@ -369,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit, | |||||||
| 		    communicator,MPI_STATUS_IGNORE); | 		    communicator,MPI_STATUS_IGNORE); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
|  |  | ||||||
|   //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); |  | ||||||
|   //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes); |  | ||||||
|   //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush |  | ||||||
| } | } | ||||||
| // Basic Halo comms primitive | // Basic Halo comms primitive | ||||||
| double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | ||||||
| @@ -381,12 +387,278 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | |||||||
| 						     int bytes,int dir) | 						     int bytes,int dir) | ||||||
| { | { | ||||||
|   std::vector<CommsRequest_t> list; |   std::vector<CommsRequest_t> list; | ||||||
|   double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); |   double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); | ||||||
|  |   offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); | ||||||
|   StencilSendToRecvFromComplete(list,dir); |   StencilSendToRecvFromComplete(list,dir); | ||||||
|   return offbytes; |   return offbytes; | ||||||
| } | } | ||||||
|  |  | ||||||
| #undef NVLINK_GET // Define to use get instead of put DMA |  | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {}; | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {}; | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, | ||||||
|  | 							   void *xmit, | ||||||
|  | 							   int dest,int dox, | ||||||
|  | 							   void *recv, | ||||||
|  | 							   int from,int dor, | ||||||
|  | 							   int xbytes,int rbytes,int dir) | ||||||
|  | { | ||||||
|  |   return 0.0; // Do nothing -- no preparation required | ||||||
|  | } | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 							 void *xmit, | ||||||
|  | 							 int dest,int dox, | ||||||
|  | 							 void *recv, | ||||||
|  | 							 int from,int dor, | ||||||
|  | 							 int xbytes,int rbytes,int dir) | ||||||
|  | { | ||||||
|  |   int ncomm  =communicator_halo.size(); | ||||||
|  |   int commdir=dir%ncomm; | ||||||
|  |  | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |  | ||||||
|  |   int ierr; | ||||||
|  |   int gdest = ShmRanks[dest]; | ||||||
|  |   int gfrom = ShmRanks[from]; | ||||||
|  |   int gme   = ShmRanks[_processor]; | ||||||
|  |  | ||||||
|  |   assert(dest != _processor); | ||||||
|  |   assert(from != _processor); | ||||||
|  |   assert(gme  == ShmRank); | ||||||
|  |   double off_node_bytes=0.0; | ||||||
|  |   int tag; | ||||||
|  |    | ||||||
|  |   if ( dor ) { | ||||||
|  |     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { | ||||||
|  |       tag= dir+from*32; | ||||||
|  |       ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); | ||||||
|  |       assert(ierr==0); | ||||||
|  |       list.push_back(rrq); | ||||||
|  |       off_node_bytes+=rbytes; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   if (dox) { | ||||||
|  |     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { | ||||||
|  |       tag= dir+_processor*32; | ||||||
|  |       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); | ||||||
|  |       assert(ierr==0); | ||||||
|  |       list.push_back(xrq); | ||||||
|  |       off_node_bytes+=xbytes; | ||||||
|  |     } else { | ||||||
|  |       void *shm = (void *) this->ShmBufferTranslate(dest,recv); | ||||||
|  |       assert(shm!=NULL); | ||||||
|  |       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   return off_node_bytes; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | ||||||
|  | { | ||||||
|  |   int nreq=list.size(); | ||||||
|  |  | ||||||
|  |   acceleratorCopySynchronise(); | ||||||
|  |  | ||||||
|  |   if (nreq==0) return; | ||||||
|  |   std::vector<MPI_Status> status(nreq); | ||||||
|  |   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|  |   assert(ierr==0); | ||||||
|  |   list.resize(0); | ||||||
|  |   this->StencilBarrier();  | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #else /* NOT     ... ACCELERATOR_AWARE_MPI */ | ||||||
|  | /////////////////////////////////////////// | ||||||
|  | // Pipeline mode through host memory | ||||||
|  | /////////////////////////////////////////// | ||||||
|  |   /* | ||||||
|  |    * In prepare (phase 1): | ||||||
|  |    * PHASE 1: (prepare) | ||||||
|  |    * - post MPI receive buffers asynch | ||||||
|  |    * - post device - host send buffer transfer asynch | ||||||
|  |    * PHASE 2: (Begin) | ||||||
|  |    * - complete all copies | ||||||
|  |    * - post MPI send asynch | ||||||
|  |    * - post device - device transfers | ||||||
|  |    * PHASE 3: (Complete) | ||||||
|  |    * - MPI_waitall | ||||||
|  |    * - host-device transfers | ||||||
|  |    * | ||||||
|  |    ********************************* | ||||||
|  |    * NB could split this further: | ||||||
|  |    *-------------------------------- | ||||||
|  |    * PHASE 1: (Prepare) | ||||||
|  |    * - post MPI receive buffers asynch | ||||||
|  |    * - post device - host send buffer transfer asynch | ||||||
|  |    * PHASE 2: (BeginInterNode) | ||||||
|  |    * - complete all copies  | ||||||
|  |    * - post MPI send asynch | ||||||
|  |    * PHASE 3: (BeginIntraNode) | ||||||
|  |    * - post device - device transfers | ||||||
|  |    * PHASE 4: (Complete) | ||||||
|  |    * - MPI_waitall | ||||||
|  |    * - host-device transfers asynch | ||||||
|  |    * - (complete all copies)  | ||||||
|  |    */ | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, | ||||||
|  | 							   void *xmit, | ||||||
|  | 							   int dest,int dox, | ||||||
|  | 							   void *recv, | ||||||
|  | 							   int from,int dor, | ||||||
|  | 							   int xbytes,int rbytes,int dir) | ||||||
|  | { | ||||||
|  | /* | ||||||
|  |  * Bring sequence from Stencil.h down to lower level. | ||||||
|  |  * Assume using XeLink is ok | ||||||
|  |  */   | ||||||
|  |   int ncomm  =communicator_halo.size(); | ||||||
|  |   int commdir=dir%ncomm; | ||||||
|  |  | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |  | ||||||
|  |   int ierr; | ||||||
|  |   int gdest = ShmRanks[dest]; | ||||||
|  |   int gfrom = ShmRanks[from]; | ||||||
|  |   int gme   = ShmRanks[_processor]; | ||||||
|  |  | ||||||
|  |   assert(dest != _processor); | ||||||
|  |   assert(from != _processor); | ||||||
|  |   assert(gme  == ShmRank); | ||||||
|  |   double off_node_bytes=0.0; | ||||||
|  |   int tag; | ||||||
|  |  | ||||||
|  |   void * host_recv = NULL; | ||||||
|  |   void * host_xmit = NULL; | ||||||
|  |  | ||||||
|  |   /* | ||||||
|  |    * PHASE 1: (Prepare) | ||||||
|  |    * - post MPI receive buffers asynch | ||||||
|  |    * - post device - host send buffer transfer asynch | ||||||
|  |    */ | ||||||
|  |    | ||||||
|  |   if ( dor ) { | ||||||
|  |     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { | ||||||
|  |       tag= dir+from*32; | ||||||
|  |       host_recv = this->HostBufferMalloc(rbytes); | ||||||
|  |       ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); | ||||||
|  |       assert(ierr==0); | ||||||
|  |       CommsRequest_t srq; | ||||||
|  |       srq.PacketType = InterNodeRecv; | ||||||
|  |       srq.bytes      = rbytes; | ||||||
|  |       srq.req        = rrq; | ||||||
|  |       srq.host_buf   = host_recv; | ||||||
|  |       srq.device_buf = recv; | ||||||
|  |       list.push_back(srq); | ||||||
|  |       off_node_bytes+=rbytes; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   if (dox) { | ||||||
|  |     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { | ||||||
|  |  | ||||||
|  |       tag= dir+_processor*32; | ||||||
|  |  | ||||||
|  |       host_xmit = this->HostBufferMalloc(xbytes); | ||||||
|  |       CommsRequest_t srq; | ||||||
|  |  | ||||||
|  |       srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch | ||||||
|  |        | ||||||
|  |       //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); | ||||||
|  |       //      assert(ierr==0); | ||||||
|  |       //      off_node_bytes+=xbytes; | ||||||
|  |  | ||||||
|  |       srq.PacketType = InterNodeXmit; | ||||||
|  |       srq.bytes      = xbytes; | ||||||
|  |       //      srq.req        = xrq; | ||||||
|  |       srq.host_buf   = host_xmit; | ||||||
|  |       srq.device_buf = xmit; | ||||||
|  |       srq.tag        = tag; | ||||||
|  |       srq.dest       = dest; | ||||||
|  |       srq.commdir    = commdir; | ||||||
|  |       list.push_back(srq); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   return off_node_bytes; | ||||||
|  | } | ||||||
|  | /* | ||||||
|  |  * In the interest of better pipelining, poll for completion on each DtoH and  | ||||||
|  |  * start MPI_ISend in the meantime | ||||||
|  |  */ | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   int pending = 0; | ||||||
|  |   do { | ||||||
|  |  | ||||||
|  |     pending = 0; | ||||||
|  |  | ||||||
|  |     for(int idx = 0; idx<list.size();idx++){ | ||||||
|  |  | ||||||
|  |       if ( list[idx].PacketType==InterNodeRecv ) { | ||||||
|  |  | ||||||
|  | 	int flag = 0; | ||||||
|  | 	MPI_Status status; | ||||||
|  | 	int ierr = MPI_Test(&list[idx].req,&flag,&status); | ||||||
|  | 	assert(ierr==0); | ||||||
|  |  | ||||||
|  | 	if ( flag ) { | ||||||
|  | 	  //	  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl; | ||||||
|  | 	  acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes); | ||||||
|  | 	  list[idx].PacketType=InterNodeReceiveHtoD; | ||||||
|  | 	} else { | ||||||
|  | 	  pending ++; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl; | ||||||
|  |   } while ( pending ); | ||||||
|  |    | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   int pending = 0; | ||||||
|  |   do { | ||||||
|  |  | ||||||
|  |     pending = 0; | ||||||
|  |  | ||||||
|  |     for(int idx = 0; idx<list.size();idx++){ | ||||||
|  |  | ||||||
|  |       if ( list[idx].PacketType==InterNodeXmit ) { | ||||||
|  |  | ||||||
|  | 	if ( acceleratorEventIsComplete(list[idx].ev) ) { | ||||||
|  |  | ||||||
|  | 	  void *host_xmit = list[idx].host_buf; | ||||||
|  | 	  uint32_t xbytes = list[idx].bytes; | ||||||
|  | 	  int dest        = list[idx].dest; | ||||||
|  | 	  int tag         = list[idx].tag; | ||||||
|  | 	  int commdir     = list[idx].commdir; | ||||||
|  | 	  /////////////////// | ||||||
|  | 	  // Send packet | ||||||
|  | 	  /////////////////// | ||||||
|  |  | ||||||
|  | 	  //	  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl; | ||||||
|  | 	   | ||||||
|  | 	  MPI_Request xrq; | ||||||
|  | 	  int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); | ||||||
|  | 	  assert(ierr==0); | ||||||
|  |  | ||||||
|  | 	  list[idx].req        = xrq; // Update the MPI request in the list | ||||||
|  |  | ||||||
|  | 	  list[idx].PacketType=InterNodeXmitISend; | ||||||
|  |  | ||||||
|  | 	} else { | ||||||
|  | 	  // not done, so return to polling loop | ||||||
|  | 	  pending++; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } while (pending); | ||||||
|  | }   | ||||||
|  |  | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int dest,int dox, | 							 int dest,int dox, | ||||||
| @@ -411,54 +683,101 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|   double off_node_bytes=0.0; |   double off_node_bytes=0.0; | ||||||
|   int tag; |   int tag; | ||||||
|  |  | ||||||
|   if ( dor ) { |   void * host_xmit = NULL; | ||||||
|     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { |  | ||||||
|       tag= dir+from*32; |   //////////////////////////////// | ||||||
|       ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); |   // Receives already posted | ||||||
|       assert(ierr==0); |   // Copies already started | ||||||
|       list.push_back(rrq); |   //////////////////////////////// | ||||||
|       off_node_bytes+=rbytes; |   /*   | ||||||
|     } |    * PHASE 2: (Begin) | ||||||
|  |    * - complete all copies | ||||||
|  |    * - post MPI send asynch | ||||||
|  |    */ | ||||||
| #ifdef NVLINK_GET | #ifdef NVLINK_GET | ||||||
|  |   if ( dor ) { | ||||||
|  |  | ||||||
|  |     if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) { | ||||||
|  |       // Intranode | ||||||
|       void *shm = (void *) this->ShmBufferTranslate(from,xmit); |       void *shm = (void *) this->ShmBufferTranslate(from,xmit); | ||||||
|       assert(shm!=NULL); |       assert(shm!=NULL); | ||||||
|       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); |  | ||||||
| #endif |       CommsRequest_t srq; | ||||||
|   } |  | ||||||
|    |       srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); | ||||||
|  |  | ||||||
|  |       srq.PacketType = IntraNodeRecv; | ||||||
|  |       srq.bytes      = xbytes; | ||||||
|  |       //      srq.req        = xrq; | ||||||
|  |       srq.host_buf   = NULL; | ||||||
|  |       srq.device_buf = xmit; | ||||||
|  |       srq.tag        = -1; | ||||||
|  |       srq.dest       = dest; | ||||||
|  |       srq.commdir    = dir; | ||||||
|  |       list.push_back(srq); | ||||||
|  |     } | ||||||
|  |   }   | ||||||
|  | #else | ||||||
|   if (dox) { |   if (dox) { | ||||||
|     //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes); |  | ||||||
|     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { |     if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) { | ||||||
|       tag= dir+_processor*32; |       // Intranode | ||||||
|       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); |  | ||||||
|       assert(ierr==0); |  | ||||||
|       list.push_back(xrq); |  | ||||||
|       off_node_bytes+=xbytes; |  | ||||||
|     } else { |  | ||||||
| #ifndef NVLINK_GET |  | ||||||
|       void *shm = (void *) this->ShmBufferTranslate(dest,recv); |       void *shm = (void *) this->ShmBufferTranslate(dest,recv); | ||||||
|       assert(shm!=NULL); |       assert(shm!=NULL); | ||||||
|       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); |  | ||||||
| #endif |       CommsRequest_t srq; | ||||||
|  |        | ||||||
|  |       srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); | ||||||
|  |  | ||||||
|  |       srq.PacketType = IntraNodeXmit; | ||||||
|  |       srq.bytes      = xbytes; | ||||||
|  |       //      srq.req        = xrq; | ||||||
|  |       srq.host_buf   = NULL; | ||||||
|  |       srq.device_buf = xmit; | ||||||
|  |       srq.tag        = -1; | ||||||
|  |       srq.dest       = dest; | ||||||
|  |       srq.commdir    = dir; | ||||||
|  |       list.push_back(srq); | ||||||
|        |        | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  | #endif | ||||||
|   return off_node_bytes; |   return off_node_bytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) | ||||||
| { | { | ||||||
|   int nreq=list.size(); |   //  int nreq=list.size(); | ||||||
|  |  | ||||||
|   acceleratorCopySynchronise(); |   //  if (nreq==0) return; | ||||||
|  |   //  std::vector<MPI_Status> status(nreq); | ||||||
|  |   //  std::vector<MPI_Request> MpiRequests(nreq); | ||||||
|  |  | ||||||
|   if (nreq==0) return; |   //  for(int r=0;r<nreq;r++){ | ||||||
|  |   //    MpiRequests[r] = list[r].req; | ||||||
|  |   //  } | ||||||
|  |    | ||||||
|  |   //  int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing. | ||||||
|  |   //  assert(ierr==0); | ||||||
|  |  | ||||||
|   std::vector<MPI_Status> status(nreq); |   //  for(int r=0;r<nreq;r++){ | ||||||
|   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); |   //    if ( list[r].PacketType==InterNodeRecv ) { | ||||||
|   assert(ierr==0); |   //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes); | ||||||
|   list.resize(0); |   //    } | ||||||
|  |   //  } | ||||||
|  |    | ||||||
|  |   acceleratorCopySynchronise(); // Complete all pending copy transfers D2D | ||||||
|  |    | ||||||
|  |   list.resize(0);               // Delete the list | ||||||
|  |   this->HostBufferFreeAll();    // Clean up the buffer allocs | ||||||
|  | #ifndef NVLINK_GET | ||||||
|  |   this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers. | ||||||
|  | #endif    | ||||||
| } | } | ||||||
|  | #endif | ||||||
|  | //////////////////////////////////////////// | ||||||
|  | // END PIPELINE MODE / NO CUDA AWARE MPI | ||||||
|  | //////////////////////////////////////////// | ||||||
|  |  | ||||||
| void CartesianCommunicator::StencilBarrier(void) | void CartesianCommunicator::StencilBarrier(void) | ||||||
| { | { | ||||||
|   MPI_Barrier  (ShmComm); |   MPI_Barrier  (ShmComm); | ||||||
|   | |||||||
| @@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | |||||||
| { | { | ||||||
|   return 2.0*bytes; |   return 2.0*bytes; | ||||||
| } | } | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {}; | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {}; | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, | ||||||
|  | 							   void *xmit, | ||||||
|  | 							   int xmit_to_rank,int dox, | ||||||
|  | 							   void *recv, | ||||||
|  | 							   int recv_from_rank,int dor, | ||||||
|  | 							   int xbytes,int rbytes, int dir) | ||||||
|  | { | ||||||
|  |   return 0.0; | ||||||
|  | } | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int xmit_to_rank,int dox, | 							 int xmit_to_rank,int dox, | ||||||
|   | |||||||
| @@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid); | |||||||
|  |  | ||||||
| #if defined (GRID_COMMS_MPI3)  | #if defined (GRID_COMMS_MPI3)  | ||||||
| typedef MPI_Comm    Grid_MPI_Comm; | typedef MPI_Comm    Grid_MPI_Comm; | ||||||
|  | typedef MPI_Request MpiCommsRequest_t; | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
| typedef MPI_Request CommsRequest_t; | typedef MPI_Request CommsRequest_t; | ||||||
|  | #else | ||||||
|  | /* | ||||||
|  |  * Enable state transitions as each packet flows. | ||||||
|  |  */ | ||||||
|  | enum PacketType_t { | ||||||
|  |   FaceGather, | ||||||
|  |   InterNodeXmit, | ||||||
|  |   InterNodeRecv, | ||||||
|  |   IntraNodeXmit, | ||||||
|  |   IntraNodeRecv, | ||||||
|  |   InterNodeXmitISend, | ||||||
|  |   InterNodeReceiveHtoD | ||||||
|  | }; | ||||||
|  | /* | ||||||
|  |  *Package arguments needed for various actions along packet flow | ||||||
|  |  */ | ||||||
|  | typedef struct { | ||||||
|  |   PacketType_t PacketType; | ||||||
|  |   void *host_buf; | ||||||
|  |   void *device_buf; | ||||||
|  |   int dest; | ||||||
|  |   int tag; | ||||||
|  |   int commdir; | ||||||
|  |   unsigned long bytes; | ||||||
|  |   acceleratorEvent_t ev; | ||||||
|  |   MpiCommsRequest_t req; | ||||||
|  | } CommsRequest_t; | ||||||
|  | #endif | ||||||
|  |  | ||||||
| #else  | #else  | ||||||
|  | typedef int MpiCommsRequest_t; | ||||||
| typedef int CommsRequest_t; | typedef int CommsRequest_t; | ||||||
| typedef int Grid_MPI_Comm; | typedef int Grid_MPI_Comm; | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| #ifdef ACCELERATOR_AWARE_MPI | #ifdef ACCELERATOR_AWARE_MPI | ||||||
| #define GRID_SYCL_LEVEL_ZERO_IPC | #define GRID_SYCL_LEVEL_ZERO_IPC | ||||||
| #define SHM_SOCKETS | #define SHM_SOCKETS | ||||||
|  | #else | ||||||
|  | #ifdef HAVE_NUMAIF_H | ||||||
|  |   #warning " Using NUMAIF " | ||||||
|  | #include <numaif.h> | ||||||
|  | #endif  | ||||||
| #endif  | #endif  | ||||||
| #include <syscall.h> | #include <syscall.h> | ||||||
| #endif | #endif | ||||||
| @@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   // Each MPI rank should allocate our own buffer |   // Each MPI rank should allocate our own buffer | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| #ifndef ACCELERATOR_AWARE_MPI | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|   HostCommBuf= malloc(bytes); |   printf("Host buffer allocate for GPU non-aware MPI\n"); | ||||||
|  | #if 0 | ||||||
|  |   HostCommBuf= acceleratorAllocHost(bytes); | ||||||
|  | #else  | ||||||
|  |   HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host | ||||||
|  | #ifdef HAVE_NUMAIF_H | ||||||
|  |   #warning "Moving host buffers to specific NUMA domain" | ||||||
|  |   int numa; | ||||||
|  |   char *numa_name=(char *)getenv("MPI_BUF_NUMA"); | ||||||
|  |   if(numa_name) { | ||||||
|  |     unsigned long page_size = sysconf(_SC_PAGESIZE); | ||||||
|  |     numa = atoi(numa_name); | ||||||
|  |     unsigned long page_count = bytes/page_size; | ||||||
|  |     std::vector<void *> pages(page_count); | ||||||
|  |     std::vector<int>    nodes(page_count,numa); | ||||||
|  |     std::vector<int>    status(page_count,-1); | ||||||
|  |     for(unsigned long p=0;p<page_count;p++){ | ||||||
|  |       pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size); | ||||||
|  |     } | ||||||
|  |     int ret = move_pages(0, | ||||||
|  | 			 page_count, | ||||||
|  | 			 &pages[0], | ||||||
|  | 			 &nodes[0], | ||||||
|  | 			 &status[0], | ||||||
|  | 			 MPOL_MF_MOVE); | ||||||
|  |     printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret); | ||||||
|  |     if (ret) perror(" move_pages failed for reason:"); | ||||||
|  |   } | ||||||
|  | #endif   | ||||||
|  |   acceleratorPin(HostCommBuf,bytes); | ||||||
|  | #endif   | ||||||
|  |  | ||||||
| #endif   | #endif   | ||||||
|   ShmCommBuf = acceleratorAllocDevice(bytes); |   ShmCommBuf = acceleratorAllocDevice(bytes); | ||||||
|   if (ShmCommBuf == (void *)NULL ) { |   if (ShmCommBuf == (void *)NULL ) { | ||||||
| @@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|     typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; |     typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; | ||||||
|  |  | ||||||
|     auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); |     auto zeDevice    = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); | ||||||
|     auto zeContext   = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); |     auto zeContext   = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); | ||||||
|        |        | ||||||
|     ze_ipc_mem_handle_t ihandle; |     ze_ipc_mem_handle_t ihandle; | ||||||
|     clone_mem_t handle; |     clone_mem_t handle; | ||||||
|   | |||||||
| @@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>  | template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>  | ||||||
| auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr))  | auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr))  | ||||||
| { | { | ||||||
|   | |||||||
| @@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| extern std::vector<std::pair<int,int> > Cshift_table;  | extern std::vector<std::pair<int,int> > Cshift_table;  | ||||||
| extern commVector<std::pair<int,int> > Cshift_table_device;  | extern deviceVector<std::pair<int,int> > Cshift_table_device;  | ||||||
|  |  | ||||||
| inline std::pair<int,int> *MapCshiftTable(void) | inline std::pair<int,int> *MapCshiftTable(void) | ||||||
| { | { | ||||||
|   // GPU version |   // GPU version | ||||||
| #ifdef ACCELERATOR_CSHIFT     |  | ||||||
|   uint64_t sz=Cshift_table.size(); |   uint64_t sz=Cshift_table.size(); | ||||||
|   if (Cshift_table_device.size()!=sz )    { |   if (Cshift_table_device.size()!=sz )    { | ||||||
|     Cshift_table_device.resize(sz); |     Cshift_table_device.resize(sz); | ||||||
| @@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void) | |||||||
| 			  sizeof(Cshift_table[0])*sz); | 			  sizeof(Cshift_table[0])*sz); | ||||||
|  |  | ||||||
|   return &Cshift_table_device[0]; |   return &Cshift_table_device[0]; | ||||||
| #else  |  | ||||||
|   return &Cshift_table[0]; |  | ||||||
| #endif |  | ||||||
|   // CPU version use identify map |   // CPU version use identify map | ||||||
| } | } | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| // Gather for when there is no need to SIMD split  | // Gather for when there is no need to SIMD split  | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| template<class vobj> void  | template<class vobj> void  | ||||||
| Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) | Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) | ||||||
| { | { | ||||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; |   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||||
|  |  | ||||||
| @@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim | |||||||
|   { |   { | ||||||
|     auto buffer_p = & buffer[0]; |     auto buffer_p = & buffer[0]; | ||||||
|     auto table = MapCshiftTable(); |     auto table = MapCshiftTable(); | ||||||
| #ifdef ACCELERATOR_CSHIFT |  | ||||||
|     autoView(rhs_v , rhs, AcceleratorRead); |     autoView(rhs_v , rhs, AcceleratorRead); | ||||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ |     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||||
| 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); | 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); | ||||||
|     }); |     }); | ||||||
| #else |  | ||||||
|     autoView(rhs_v , rhs, CpuRead); |  | ||||||
|     thread_for(i,ent,{ |  | ||||||
|       buffer_p[table[i].first]=rhs_v[table[i].second]; |  | ||||||
|     }); |  | ||||||
| #endif |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs, | |||||||
|   int n1=rhs.Grid()->_slice_stride[dimension]; |   int n1=rhs.Grid()->_slice_stride[dimension]; | ||||||
|  |  | ||||||
|   if ( cbmask ==0x3){ |   if ( cbmask ==0x3){ | ||||||
| #ifdef ACCELERATOR_CSHIFT |  | ||||||
|     autoView(rhs_v , rhs, AcceleratorRead); |     autoView(rhs_v , rhs, AcceleratorRead); | ||||||
|     accelerator_for(nn,e1*e2,1,{ |     accelerator_for(nn,e1*e2,1,{ | ||||||
| 	int n = nn%e1; | 	int n = nn%e1; | ||||||
| @@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs, | |||||||
| 	vobj temp =rhs_v[so+o+b]; | 	vobj temp =rhs_v[so+o+b]; | ||||||
| 	extract<vobj>(temp,pointers,offset); | 	extract<vobj>(temp,pointers,offset); | ||||||
|       }); |       }); | ||||||
| #else |  | ||||||
|     autoView(rhs_v , rhs, CpuRead); |  | ||||||
|     thread_for2d(n,e1,b,e2,{ |  | ||||||
| 	int o      =   n*n1; |  | ||||||
| 	int offset = b+n*e2; |  | ||||||
| 	 |  | ||||||
| 	vobj temp =rhs_v[so+o+b]; |  | ||||||
| 	extract<vobj>(temp,pointers,offset); |  | ||||||
|       }); |  | ||||||
| #endif |  | ||||||
|   } else {  |   } else {  | ||||||
|     Coordinate rdim=rhs.Grid()->_rdimensions; |     Coordinate rdim=rhs.Grid()->_rdimensions; | ||||||
|     Coordinate cdm =rhs.Grid()->_checker_dim_mask; |     Coordinate cdm =rhs.Grid()->_checker_dim_mask; | ||||||
|     std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? |     std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? | ||||||
| #ifdef ACCELERATOR_CSHIFT     |  | ||||||
|     autoView(rhs_v , rhs, AcceleratorRead); |     autoView(rhs_v , rhs, AcceleratorRead); | ||||||
|     accelerator_for(nn,e1*e2,1,{ |     accelerator_for(nn,e1*e2,1,{ | ||||||
| 	int n = nn%e1; | 	int n = nn%e1; | ||||||
| @@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs, | |||||||
| 	  extract<vobj>(temp,pointers,offset); | 	  extract<vobj>(temp,pointers,offset); | ||||||
| 	} | 	} | ||||||
|       }); |       }); | ||||||
| #else |  | ||||||
|     autoView(rhs_v , rhs, CpuRead); |  | ||||||
|     thread_for2d(n,e1,b,e2,{ |  | ||||||
|  |  | ||||||
| 	Coordinate coor; |  | ||||||
|  |  | ||||||
| 	int o=n*n1; |  | ||||||
| 	int oindex = o+b; |  | ||||||
|  |  | ||||||
|        	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); |  | ||||||
|  |  | ||||||
| 	int ocb=1<<cb; |  | ||||||
| 	int offset = b+n*e2; |  | ||||||
|  |  | ||||||
| 	if ( ocb & cbmask ) { |  | ||||||
| 	  vobj temp =rhs_v[so+o+b]; |  | ||||||
| 	  extract<vobj>(temp,pointers,offset); |  | ||||||
| 	} |  | ||||||
|       }); |  | ||||||
| #endif |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| // Scatter for when there is no need to SIMD split | // Scatter for when there is no need to SIMD split | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask) | template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask) | ||||||
| { | { | ||||||
|   int rd = rhs.Grid()->_rdimensions[dimension]; |   int rd = rhs.Grid()->_rdimensions[dimension]; | ||||||
|  |  | ||||||
| @@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector< | |||||||
|   { |   { | ||||||
|     auto buffer_p = & buffer[0]; |     auto buffer_p = & buffer[0]; | ||||||
|     auto table = MapCshiftTable(); |     auto table = MapCshiftTable(); | ||||||
| #ifdef ACCELERATOR_CSHIFT     |  | ||||||
|     autoView( rhs_v, rhs, AcceleratorWrite); |     autoView( rhs_v, rhs, AcceleratorWrite); | ||||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ |     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||||
| 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); | 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); | ||||||
|     }); |     }); | ||||||
| #else |  | ||||||
|     autoView( rhs_v, rhs, CpuWrite); |  | ||||||
|     thread_for(i,ent,{ |  | ||||||
|       rhs_v[table[i].first]=buffer_p[table[i].second]; |  | ||||||
|     }); |  | ||||||
| #endif |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -278,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA | |||||||
|   if(cbmask ==0x3 ) { |   if(cbmask ==0x3 ) { | ||||||
|     int _slice_stride = rhs.Grid()->_slice_stride[dimension]; |     int _slice_stride = rhs.Grid()->_slice_stride[dimension]; | ||||||
|     int _slice_block = rhs.Grid()->_slice_block[dimension]; |     int _slice_block = rhs.Grid()->_slice_block[dimension]; | ||||||
| #ifdef ACCELERATOR_CSHIFT     |  | ||||||
|     autoView( rhs_v , rhs, AcceleratorWrite); |     autoView( rhs_v , rhs, AcceleratorWrite); | ||||||
|     accelerator_for(nn,e1*e2,1,{ |     accelerator_for(nn,e1*e2,1,{ | ||||||
| 	int n = nn%e1; | 	int n = nn%e1; | ||||||
| @@ -287,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA | |||||||
| 	int offset = b+n*_slice_block; | 	int offset = b+n*_slice_block; | ||||||
| 	merge(rhs_v[so+o+b],pointers,offset); | 	merge(rhs_v[so+o+b],pointers,offset); | ||||||
|       }); |       }); | ||||||
| #else |  | ||||||
|     autoView( rhs_v , rhs, CpuWrite); |  | ||||||
|     thread_for2d(n,e1,b,e2,{ |  | ||||||
| 	int o      = n*_slice_stride; |  | ||||||
| 	int offset = b+n*_slice_block; |  | ||||||
| 	merge(rhs_v[so+o+b],pointers,offset); |  | ||||||
|     }); |  | ||||||
| #endif |  | ||||||
|   } else {  |   } else {  | ||||||
|  |  | ||||||
|     // Case of SIMD split AND checker dim cannot currently be hit, except in  |     // Case of SIMD split AND checker dim cannot currently be hit, except in  | ||||||
| @@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | |||||||
|  |  | ||||||
|   { |   { | ||||||
|     auto table = MapCshiftTable(); |     auto table = MapCshiftTable(); | ||||||
| #ifdef ACCELERATOR_CSHIFT     |  | ||||||
|     autoView(rhs_v , rhs, AcceleratorRead); |     autoView(rhs_v , rhs, AcceleratorRead); | ||||||
|     autoView(lhs_v , lhs, AcceleratorWrite); |     autoView(lhs_v , lhs, AcceleratorWrite); | ||||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ |     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||||
|       coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); |       coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); | ||||||
|     }); |     }); | ||||||
| #else |  | ||||||
|     autoView(rhs_v , rhs, CpuRead); |  | ||||||
|     autoView(lhs_v , lhs, CpuWrite); |  | ||||||
|     thread_for(i,ent,{ |  | ||||||
|       lhs_v[table[i].first]=rhs_v[table[i].second]; |  | ||||||
|     }); |  | ||||||
| #endif |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | |||||||
|  |  | ||||||
|   { |   { | ||||||
|     auto table = MapCshiftTable(); |     auto table = MapCshiftTable(); | ||||||
| #ifdef ACCELERATOR_CSHIFT     |  | ||||||
|     autoView( rhs_v, rhs, AcceleratorRead); |     autoView( rhs_v, rhs, AcceleratorRead); | ||||||
|     autoView( lhs_v, lhs, AcceleratorWrite); |     autoView( lhs_v, lhs, AcceleratorWrite); | ||||||
|     accelerator_for(i,ent,1,{ |     accelerator_for(i,ent,1,{ | ||||||
|       permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); |       permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); | ||||||
|     }); |     }); | ||||||
| #else |  | ||||||
|     autoView( rhs_v, rhs, CpuRead); |  | ||||||
|     autoView( lhs_v, lhs, CpuWrite); |  | ||||||
|     thread_for(i,ent,{ |  | ||||||
|       permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); |  | ||||||
|     }); |  | ||||||
| #endif |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid);  | NAMESPACE_BEGIN(Grid);  | ||||||
|  | const int Cshift_verbose=0; | ||||||
| template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) | template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift) | ||||||
| { | { | ||||||
|   typedef typename vobj::vector_type vector_type; |   typedef typename vobj::vector_type vector_type; | ||||||
| @@ -55,17 +55,17 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension | |||||||
|   RealD t1,t0; |   RealD t1,t0; | ||||||
|   t0=usecond(); |   t0=usecond(); | ||||||
|   if ( !comm_dim ) { |   if ( !comm_dim ) { | ||||||
|     //std::cout << "CSHIFT: Cshift_local" <<std::endl; |     //    std::cout << "CSHIFT: Cshift_local" <<std::endl; | ||||||
|     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding |     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding | ||||||
|   } else if ( splice_dim ) { |   } else if ( splice_dim ) { | ||||||
|     //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl; |     //    std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl; | ||||||
|     Cshift_comms_simd(ret,rhs,dimension,shift); |     Cshift_comms_simd(ret,rhs,dimension,shift); | ||||||
|   } else { |   } else { | ||||||
|     //std::cout << "CSHIFT: Cshift_comms" <<std::endl; |     //    std::cout << "CSHIFT: Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms(ret,rhs,dimension,shift); |     Cshift_comms(ret,rhs,dimension,shift); | ||||||
|   } |   } | ||||||
|   t1=usecond(); |   t1=usecond(); | ||||||
|   //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; |   if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -94,18 +94,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob | |||||||
|   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); |   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); | ||||||
|   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); |   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); | ||||||
|  |  | ||||||
|   //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; |   //  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl; | ||||||
|   if ( sshift[0] == sshift[1] ) { |   if ( sshift[0] == sshift[1] ) { | ||||||
|     //std::cout << "Single pass Cshift_comms" <<std::endl; |     //    std::cout << "Single pass Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms_simd(ret,rhs,dimension,shift,0x3); |     Cshift_comms_simd(ret,rhs,dimension,shift,0x3); | ||||||
|   } else { |   } else { | ||||||
|     //std::cout << "Two pass Cshift_comms" <<std::endl; |     //    std::cout << "Two pass Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes |     Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes | ||||||
|     Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration |     Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration | ||||||
|   } |   } | ||||||
| } | } | ||||||
| #define ACCELERATOR_CSHIFT_NO_COPY |  | ||||||
| #ifdef ACCELERATOR_CSHIFT_NO_COPY |  | ||||||
| template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| { | { | ||||||
|   typedef typename vobj::vector_type vector_type; |   typedef typename vobj::vector_type vector_type; | ||||||
| @@ -125,9 +123,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|    |    | ||||||
|   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; |   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; | ||||||
|   static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size); |   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size); | ||||||
|   static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size); |   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size); | ||||||
|      | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   static hostVector<vobj> hsend_buf;  hsend_buf.resize(buffer_size); | ||||||
|  |   static hostVector<vobj> hrecv_buf;  hrecv_buf.resize(buffer_size); | ||||||
|  | #endif | ||||||
|  |    | ||||||
|   int cb= (cbmask==0x2)? Odd : Even; |   int cb= (cbmask==0x2)? Odd : Even; | ||||||
|   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); |   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); | ||||||
|   RealD tcopy=0.0; |   RealD tcopy=0.0; | ||||||
| @@ -158,18 +160,31 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|       //      int rank           = grid->_processor; |       //      int rank           = grid->_processor; | ||||||
|       int recv_from_rank; |       int recv_from_rank; | ||||||
|       int xmit_to_rank; |       int xmit_to_rank; | ||||||
|  |  | ||||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); |       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|        |        | ||||||
|       tcomms-=usecond(); |       tcomms-=usecond(); | ||||||
|       //      grid->Barrier(); |       grid->Barrier(); | ||||||
|  |  | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
|       grid->SendToRecvFrom((void *)&send_buf[0], |       grid->SendToRecvFrom((void *)&send_buf[0], | ||||||
| 			   xmit_to_rank, | 			   xmit_to_rank, | ||||||
| 			   (void *)&recv_buf[0], | 			   (void *)&recv_buf[0], | ||||||
| 			   recv_from_rank, | 			   recv_from_rank, | ||||||
| 			   bytes); | 			   bytes); | ||||||
|  | #else | ||||||
|  |       // bouncy bouncy | ||||||
|  |       acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes); | ||||||
|  |       grid->SendToRecvFrom((void *)&hsend_buf[0], | ||||||
|  | 			   xmit_to_rank, | ||||||
|  | 			   (void *)&hrecv_buf[0], | ||||||
|  | 			   recv_from_rank, | ||||||
|  | 			   bytes); | ||||||
|  |       acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|       xbytes+=bytes; |       xbytes+=bytes; | ||||||
|       //      grid->Barrier(); |       grid->Barrier(); | ||||||
|       tcomms+=usecond(); |       tcomms+=usecond(); | ||||||
|  |  | ||||||
|       tscatter-=usecond(); |       tscatter-=usecond(); | ||||||
| @@ -177,13 +192,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|       tscatter+=usecond(); |       tscatter+=usecond(); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   /* |   if (Cshift_verbose){ | ||||||
|   std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; |     std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||||
|   std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; |     std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||||
|   std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; |     std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||||
|   std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; |     std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; |     std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; | ||||||
|   */ |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| @@ -201,9 +216,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   int simd_layout     = grid->_simd_layout[dimension]; |   int simd_layout     = grid->_simd_layout[dimension]; | ||||||
|   int comm_dim        = grid->_processors[dimension] >1 ; |   int comm_dim        = grid->_processors[dimension] >1 ; | ||||||
|  |  | ||||||
|   //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd |   //  std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd | ||||||
|   //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout  |   //	    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout  | ||||||
|   //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl; |   //	    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl; | ||||||
|  |  | ||||||
|   assert(comm_dim==1); |   assert(comm_dim==1); | ||||||
|   assert(simd_layout==2); |   assert(simd_layout==2); | ||||||
| @@ -224,16 +239,21 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; |   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; | ||||||
|   //  int words = sizeof(vobj)/sizeof(vector_type); |   //  int words = sizeof(vobj)/sizeof(vector_type); | ||||||
|  |  | ||||||
|   static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd); |   static std::vector<deviceVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd); | ||||||
|   static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd); |   static std::vector<deviceVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd); | ||||||
|   scalar_object *  recv_buf_extract_mpi; |   scalar_object *  recv_buf_extract_mpi; | ||||||
|   scalar_object *  send_buf_extract_mpi; |   scalar_object *  send_buf_extract_mpi; | ||||||
|   |  | ||||||
|  |  | ||||||
|   for(int s=0;s<Nsimd;s++){ |   for(int s=0;s<Nsimd;s++){ | ||||||
|     send_buf_extract[s].resize(buffer_size); |     send_buf_extract[s].resize(buffer_size); | ||||||
|     recv_buf_extract[s].resize(buffer_size); |     recv_buf_extract[s].resize(buffer_size); | ||||||
|   } |   } | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |   hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size); | ||||||
|  |   hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size); | ||||||
|  | #endif | ||||||
|  |    | ||||||
|   int bytes = buffer_size*sizeof(scalar_object); |   int bytes = buffer_size*sizeof(scalar_object); | ||||||
|  |  | ||||||
|   ExtractPointerArray<scalar_object>  pointers(Nsimd); //  |   ExtractPointerArray<scalar_object>  pointers(Nsimd); //  | ||||||
| @@ -281,266 +301,50 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
| 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|  |  | ||||||
| 	tcomms-=usecond(); | 	tcomms-=usecond(); | ||||||
| 	//	grid->Barrier(); | 	grid->Barrier(); | ||||||
|  |  | ||||||
| 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; | 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; | ||||||
| 	recv_buf_extract_mpi = &recv_buf_extract[i][0]; | 	recv_buf_extract_mpi = &recv_buf_extract[i][0]; | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
| 	grid->SendToRecvFrom((void *)send_buf_extract_mpi, | 	grid->SendToRecvFrom((void *)send_buf_extract_mpi, | ||||||
| 			     xmit_to_rank, | 			     xmit_to_rank, | ||||||
| 			     (void *)recv_buf_extract_mpi, | 			     (void *)recv_buf_extract_mpi, | ||||||
| 			     recv_from_rank, | 			     recv_from_rank, | ||||||
| 			     bytes); | 			     bytes); | ||||||
|  | #else | ||||||
| 	xbytes+=bytes; |       // bouncy bouncy | ||||||
| 	//	grid->Barrier(); | 	acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes); | ||||||
| 	tcomms+=usecond(); | 	grid->SendToRecvFrom((void *)&hsend_buf[0], | ||||||
|  |  | ||||||
| 	rpointers[i] = &recv_buf_extract[i][0]; |  | ||||||
|       } else {  |  | ||||||
| 	rpointers[i] = &send_buf_extract[nbr_lane][0]; |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|     } |  | ||||||
|     tscatter-=usecond(); |  | ||||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); |  | ||||||
|     tscatter+=usecond(); |  | ||||||
|   } |  | ||||||
|   /* |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; |  | ||||||
|   */ |  | ||||||
| } |  | ||||||
| #else  |  | ||||||
| template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::vector_type vector_type; |  | ||||||
|   typedef typename vobj::scalar_type scalar_type; |  | ||||||
|  |  | ||||||
|   GridBase *grid=rhs.Grid(); |  | ||||||
|   Lattice<vobj> temp(rhs.Grid()); |  | ||||||
|  |  | ||||||
|   int fd              = rhs.Grid()->_fdimensions[dimension]; |  | ||||||
|   int rd              = rhs.Grid()->_rdimensions[dimension]; |  | ||||||
|   int pd              = rhs.Grid()->_processors[dimension]; |  | ||||||
|   int simd_layout     = rhs.Grid()->_simd_layout[dimension]; |  | ||||||
|   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ; |  | ||||||
|   assert(simd_layout==1); |  | ||||||
|   assert(comm_dim==1); |  | ||||||
|   assert(shift>=0); |  | ||||||
|   assert(shift<fd); |  | ||||||
|   RealD tcopy=0.0; |  | ||||||
|   RealD tgather=0.0; |  | ||||||
|   RealD tscatter=0.0; |  | ||||||
|   RealD tcomms=0.0; |  | ||||||
|   uint64_t xbytes=0; |  | ||||||
|    |  | ||||||
|   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; |  | ||||||
|   static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size); |  | ||||||
|   static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size); |  | ||||||
|   vobj *send_buf; |  | ||||||
|   vobj *recv_buf; |  | ||||||
|   { |  | ||||||
|     grid->ShmBufferFreeAll(); |  | ||||||
|     size_t bytes = buffer_size*sizeof(vobj); |  | ||||||
|     send_buf=(vobj *)grid->ShmBufferMalloc(bytes); |  | ||||||
|     recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); |  | ||||||
|   } |  | ||||||
|      |  | ||||||
|   int cb= (cbmask==0x2)? Odd : Even; |  | ||||||
|   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); |  | ||||||
|  |  | ||||||
|   for(int x=0;x<rd;x++){        |  | ||||||
|  |  | ||||||
|     int sx        =  (x+sshift)%rd; |  | ||||||
|     int comm_proc = ((x+sshift)/rd)%pd; |  | ||||||
|      |  | ||||||
|     if (comm_proc==0) { |  | ||||||
|  |  | ||||||
|       tcopy-=usecond(); |  | ||||||
|       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  |  | ||||||
|       tcopy+=usecond(); |  | ||||||
|  |  | ||||||
|     } else { |  | ||||||
|  |  | ||||||
|       int words = buffer_size; |  | ||||||
|       if (cbmask != 0x3) words=words>>1; |  | ||||||
|  |  | ||||||
|       int bytes = words * sizeof(vobj); |  | ||||||
|  |  | ||||||
|       tgather-=usecond(); |  | ||||||
|       Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); |  | ||||||
|       tgather+=usecond(); |  | ||||||
|  |  | ||||||
|       //      int rank           = grid->_processor; |  | ||||||
|       int recv_from_rank; |  | ||||||
|       int xmit_to_rank; |  | ||||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|       tcomms-=usecond(); |  | ||||||
|       //      grid->Barrier(); |  | ||||||
|  |  | ||||||
|       acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); |  | ||||||
|       grid->SendToRecvFrom((void *)&send_buf[0], |  | ||||||
| 			   xmit_to_rank, |  | ||||||
| 			   (void *)&recv_buf[0], |  | ||||||
| 			   recv_from_rank, |  | ||||||
| 			   bytes); |  | ||||||
|       xbytes+=bytes; |  | ||||||
|       acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); |  | ||||||
|  |  | ||||||
|       //      grid->Barrier(); |  | ||||||
|       tcomms+=usecond(); |  | ||||||
|  |  | ||||||
|       tscatter-=usecond(); |  | ||||||
|       Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); |  | ||||||
|       tscatter+=usecond(); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   /* |  | ||||||
|   std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; |  | ||||||
|   */ |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) |  | ||||||
| { |  | ||||||
|   GridBase *grid=rhs.Grid(); |  | ||||||
|   const int Nsimd = grid->Nsimd(); |  | ||||||
|   typedef typename vobj::vector_type vector_type; |  | ||||||
|   typedef typename vobj::scalar_object scalar_object; |  | ||||||
|   typedef typename vobj::scalar_type scalar_type; |  | ||||||
|     |  | ||||||
|   int fd = grid->_fdimensions[dimension]; |  | ||||||
|   int rd = grid->_rdimensions[dimension]; |  | ||||||
|   int ld = grid->_ldimensions[dimension]; |  | ||||||
|   int pd = grid->_processors[dimension]; |  | ||||||
|   int simd_layout     = grid->_simd_layout[dimension]; |  | ||||||
|   int comm_dim        = grid->_processors[dimension] >1 ; |  | ||||||
|  |  | ||||||
|   //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd |  | ||||||
|   //    << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout  |  | ||||||
|   //    << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl; |  | ||||||
|  |  | ||||||
|   assert(comm_dim==1); |  | ||||||
|   assert(simd_layout==2); |  | ||||||
|   assert(shift>=0); |  | ||||||
|   assert(shift<fd); |  | ||||||
|   RealD tcopy=0.0; |  | ||||||
|   RealD tgather=0.0; |  | ||||||
|   RealD tscatter=0.0; |  | ||||||
|   RealD tcomms=0.0; |  | ||||||
|   uint64_t xbytes=0; |  | ||||||
|  |  | ||||||
|   int permute_type=grid->PermuteType(dimension); |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////// |  | ||||||
|   // Simd direction uses an extract/merge pair |  | ||||||
|   /////////////////////////////////////////////// |  | ||||||
|   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; |  | ||||||
|   //  int words = sizeof(vobj)/sizeof(vector_type); |  | ||||||
|  |  | ||||||
|   static std::vector<cshiftVector<scalar_object> >  send_buf_extract; send_buf_extract.resize(Nsimd); |  | ||||||
|   static std::vector<cshiftVector<scalar_object> >  recv_buf_extract; recv_buf_extract.resize(Nsimd); |  | ||||||
|   scalar_object *  recv_buf_extract_mpi; |  | ||||||
|   scalar_object *  send_buf_extract_mpi; |  | ||||||
|   { |  | ||||||
|     size_t bytes = sizeof(scalar_object)*buffer_size; |  | ||||||
|     grid->ShmBufferFreeAll(); |  | ||||||
|     send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); |  | ||||||
|     recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); |  | ||||||
|   } |  | ||||||
|   for(int s=0;s<Nsimd;s++){ |  | ||||||
|     send_buf_extract[s].resize(buffer_size); |  | ||||||
|     recv_buf_extract[s].resize(buffer_size); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   int bytes = buffer_size*sizeof(scalar_object); |  | ||||||
|  |  | ||||||
|   ExtractPointerArray<scalar_object>  pointers(Nsimd); //  |  | ||||||
|   ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////// |  | ||||||
|   // Work out what to send where |  | ||||||
|   /////////////////////////////////////////// |  | ||||||
|   int cb    = (cbmask==0x2)? Odd : Even; |  | ||||||
|   int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); |  | ||||||
|  |  | ||||||
|   // loop over outer coord planes orthog to dim |  | ||||||
|   for(int x=0;x<rd;x++){        |  | ||||||
|  |  | ||||||
|     // FIXME call local permute copy if none are offnode. |  | ||||||
|     for(int i=0;i<Nsimd;i++){        |  | ||||||
|       pointers[i] = &send_buf_extract[i][0]; |  | ||||||
|     } |  | ||||||
|     tgather-=usecond(); |  | ||||||
|     int sx   = (x+sshift)%rd; |  | ||||||
|     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); |  | ||||||
|     tgather+=usecond(); |  | ||||||
|  |  | ||||||
|     for(int i=0;i<Nsimd;i++){ |  | ||||||
|        |  | ||||||
|       int inner_bit = (Nsimd>>(permute_type+1)); |  | ||||||
|       int ic= (i&inner_bit)? 1:0; |  | ||||||
|  |  | ||||||
|       int my_coor          = rd*ic + x; |  | ||||||
|       int nbr_coor         = my_coor+sshift; |  | ||||||
|       int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors |  | ||||||
|  |  | ||||||
|       int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer |  | ||||||
|       int nbr_ox   = (nbr_coor%rd);       // outer coord of peer |  | ||||||
|       int nbr_lane = (i&(~inner_bit)); |  | ||||||
|  |  | ||||||
|       int recv_from_rank; |  | ||||||
|       int xmit_to_rank; |  | ||||||
|  |  | ||||||
|       if (nbr_ic) nbr_lane|=inner_bit; |  | ||||||
|  |  | ||||||
|       assert (sx == nbr_ox); |  | ||||||
|  |  | ||||||
|       if(nbr_proc){ |  | ||||||
| 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  |  | ||||||
|  |  | ||||||
| 	tcomms-=usecond(); |  | ||||||
| 	//	grid->Barrier(); |  | ||||||
|  |  | ||||||
| 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); |  | ||||||
| 	grid->SendToRecvFrom((void *)send_buf_extract_mpi, |  | ||||||
| 			     xmit_to_rank, | 			     xmit_to_rank, | ||||||
| 			     (void *)recv_buf_extract_mpi, | 			     (void *)&hrecv_buf[0], | ||||||
| 			     recv_from_rank, | 			     recv_from_rank, | ||||||
| 			     bytes); | 			     bytes); | ||||||
| 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); | 	acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes); | ||||||
| 	xbytes+=bytes; |  | ||||||
|  |  | ||||||
| 	//	grid->Barrier(); |  | ||||||
| 	tcomms+=usecond(); |  | ||||||
| 	rpointers[i] = &recv_buf_extract[i][0]; |  | ||||||
|       } else {  |  | ||||||
| 	rpointers[i] = &send_buf_extract[nbr_lane][0]; |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|     } |  | ||||||
|     tscatter-=usecond(); |  | ||||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); |  | ||||||
|     tscatter+=usecond(); |  | ||||||
|  |  | ||||||
|   } |  | ||||||
|   /* |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl; |  | ||||||
|   */ |  | ||||||
| } |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | 	xbytes+=bytes; | ||||||
|  | 	grid->Barrier(); | ||||||
|  | 	tcomms+=usecond(); | ||||||
|  |  | ||||||
|  | 	rpointers[i] = &recv_buf_extract[i][0]; | ||||||
|  |       } else {  | ||||||
|  | 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     tscatter-=usecond(); | ||||||
|  |     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); | ||||||
|  |     tscatter+=usecond(); | ||||||
|  |   } | ||||||
|  |   if(Cshift_verbose){ | ||||||
|  |     std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; | ||||||
|  |     std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; | ||||||
|  |     std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; | ||||||
|  |     std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; | ||||||
|  |     std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid);  | NAMESPACE_END(Grid);  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| #include <Grid/GridCore.h>        | #include <Grid/GridCore.h>        | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
| std::vector<std::pair<int,int> > Cshift_table;  | std::vector<std::pair<int,int> > Cshift_table;  | ||||||
| commVector<std::pair<int,int> > Cshift_table_device;  | deviceVector<std::pair<int,int> > Cshift_table_device;  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice | |||||||
|   }); |   }); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #define FAST_AXPY_NORM | ||||||
| template<class sobj,class vobj> inline | template<class sobj,class vobj> inline | ||||||
| RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) | RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y) | ||||||
| { | { | ||||||
|   GRID_TRACE("axpy_norm"); |   GRID_TRACE("axpy_norm"); | ||||||
|     return axpy_norm_fast(ret,a,x,y); | #ifdef FAST_AXPY_NORM | ||||||
|  |   return axpy_norm_fast(ret,a,x,y); | ||||||
|  | #else | ||||||
|  |   ret = a*x+y; | ||||||
|  |   RealD nn=norm2(ret); | ||||||
|  |   return nn; | ||||||
|  | #endif | ||||||
| } | } | ||||||
| template<class sobj,class vobj> inline | template<class sobj,class vobj> inline | ||||||
| RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) | RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y) | ||||||
| { | { | ||||||
|   GRID_TRACE("axpby_norm"); |   GRID_TRACE("axpby_norm"); | ||||||
|     return axpby_norm_fast(ret,a,b,x,y); | #ifdef FAST_AXPY_NORM | ||||||
|  |   return axpby_norm_fast(ret,a,b,x,y); | ||||||
|  | #else | ||||||
|  |   ret = a*x+b*y; | ||||||
|  |   RealD nn=norm2(ret); | ||||||
|  |   return nn; | ||||||
|  | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Trace product | /// Trace product | ||||||
|   | |||||||
| @@ -237,9 +237,12 @@ public: | |||||||
|     vobj vtmp; |     vobj vtmp; | ||||||
|     vtmp = r; |     vtmp = r; | ||||||
| #if 0 | #if 0 | ||||||
|  |     deviceVector<vobj> vvtmp(1); | ||||||
|  |     acceleratorPut(vvtmp[0],vtmp); | ||||||
|  |     vobj *vvtmp_p = & vvtmp[0]; | ||||||
|     auto me  = View(AcceleratorWrite); |     auto me  = View(AcceleratorWrite); | ||||||
|     accelerator_for(ss,me.size(),vobj::Nsimd(),{ |     accelerator_for(ss,me.size(),vobj::Nsimd(),{ | ||||||
| 	auto stmp=coalescedRead(vtmp); | 	auto stmp=coalescedRead(*vvtmp_p); | ||||||
| 	coalescedWrite(me[ss],stmp); | 	coalescedWrite(me[ss],stmp); | ||||||
|     }); |     }); | ||||||
| #else     | #else     | ||||||
|   | |||||||
| @@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) | |||||||
|   typedef decltype(basis[0]) Field; |   typedef decltype(basis[0]) Field; | ||||||
|   typedef decltype(basis[0].View(AcceleratorRead)) View; |   typedef decltype(basis[0].View(AcceleratorRead)) View; | ||||||
|  |  | ||||||
|   Vector<View> basis_v; basis_v.reserve(basis.size()); |   hostVector<View>  h_basis_v(basis.size()); | ||||||
|   typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj; |   deviceVector<View> d_basis_v(basis.size()); | ||||||
|  |   typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj; | ||||||
|   typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t; |   typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t; | ||||||
|  |  | ||||||
|   GridBase* grid = basis[0].Grid(); |   GridBase* grid = basis[0].Grid(); | ||||||
|        |        | ||||||
|   for(int k=0;k<basis.size();k++){ |   for(int k=0;k<basis.size();k++){ | ||||||
|     basis_v.push_back(basis[k].View(AcceleratorWrite)); |     h_basis_v[k] = basis[k].View(AcceleratorWrite); | ||||||
|  |     acceleratorPut(d_basis_v[k],h_basis_v[k]); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) ) |   View *basis_vp = &d_basis_v[0]; | ||||||
|   int max_threads = thread_max(); |  | ||||||
|   Vector < vobj > Bt(Nm * max_threads); |  | ||||||
|   thread_region |  | ||||||
|     { |  | ||||||
|       vobj* B = &Bt[Nm * thread_num()]; |  | ||||||
|       thread_for_in_region(ss, grid->oSites(),{ |  | ||||||
| 	  for(int j=j0; j<j1; ++j) B[j]=0.; |  | ||||||
|        |  | ||||||
| 	  for(int j=j0; j<j1; ++j){ |  | ||||||
| 	    for(int k=k0; k<k1; ++k){ |  | ||||||
| 	      B[j] +=Qt(j,k) * basis_v[k][ss]; |  | ||||||
| 	    } |  | ||||||
| 	  } |  | ||||||
| 	  for(int j=j0; j<j1; ++j){ |  | ||||||
| 	    basis_v[j][ss] = B[j]; |  | ||||||
| 	  } |  | ||||||
| 	}); |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|   View *basis_vp = &basis_v[0]; |  | ||||||
|  |  | ||||||
|   int nrot = j1-j0; |   int nrot = j1-j0; | ||||||
|   if (!nrot) // edge case not handled gracefully by Cuda |   if (!nrot) // edge case not handled gracefully by Cuda | ||||||
| @@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) | |||||||
|   uint64_t oSites   =grid->oSites(); |   uint64_t oSites   =grid->oSites(); | ||||||
|   uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead |   uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead | ||||||
|  |  | ||||||
|   Vector <vobj> Bt(siteBlock * nrot);  |   deviceVector <vobj> Bt(siteBlock * nrot);  | ||||||
|   auto Bp=&Bt[0]; |   auto Bp=&Bt[0]; | ||||||
|  |  | ||||||
|   // GPU readable copy of matrix |   // GPU readable copy of matrix | ||||||
|   Vector<Coeff_t> Qt_jv(Nm*Nm); |   hostVector<Coeff_t> h_Qt_jv(Nm*Nm); | ||||||
|  |   deviceVector<Coeff_t> Qt_jv(Nm*Nm); | ||||||
|   Coeff_t *Qt_p = & Qt_jv[0]; |   Coeff_t *Qt_p = & Qt_jv[0]; | ||||||
|   thread_for(i,Nm*Nm,{ |   thread_for(i,Nm*Nm,{ | ||||||
|       int j = i/Nm; |       int j = i/Nm; | ||||||
|       int k = i%Nm; |       int k = i%Nm; | ||||||
|       Qt_p[i]=Qt(j,k); |       h_Qt_jv[i]=Qt(j,k); | ||||||
|   }); |   }); | ||||||
|  |   acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   // Block the loop to keep storage footprint down |   // Block the loop to keep storage footprint down | ||||||
|   for(uint64_t s=0;s<oSites;s+=siteBlock){ |   for(uint64_t s=0;s<oSites;s+=siteBlock){ | ||||||
| @@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) | |||||||
| 	coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j])); | 	coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j])); | ||||||
|       }); |       }); | ||||||
|   } |   } | ||||||
| #endif |  | ||||||
|  |  | ||||||
|   for(int k=0;k<basis.size();k++) basis_v[k].ViewClose(); |   for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose(); | ||||||
| } | } | ||||||
|  |  | ||||||
| // Extract a single rotated vector | // Extract a single rotated vector | ||||||
| @@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in | |||||||
|  |  | ||||||
|   result.Checkerboard() = basis[0].Checkerboard(); |   result.Checkerboard() = basis[0].Checkerboard(); | ||||||
|  |  | ||||||
|   Vector<View> basis_v; basis_v.reserve(basis.size()); |   hostVector<View>  h_basis_v(basis.size()); | ||||||
|  |   deviceVector<View> d_basis_v(basis.size()); | ||||||
|   for(int k=0;k<basis.size();k++){ |   for(int k=0;k<basis.size();k++){ | ||||||
|     basis_v.push_back(basis[k].View(AcceleratorRead)); |     h_basis_v[k]=basis[k].View(AcceleratorRead); | ||||||
|  |     acceleratorPut(d_basis_v[k],h_basis_v[k]); | ||||||
|   } |   } | ||||||
|   vobj zz=Zero(); |  | ||||||
|   Vector<double> Qt_jv(Nm); |  | ||||||
|   double * Qt_j = & Qt_jv[0]; |  | ||||||
|   for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k); |  | ||||||
|  |  | ||||||
|   auto basis_vp=& basis_v[0]; |   vobj zz=Zero(); | ||||||
|  |   deviceVector<double> Qt_jv(Nm); | ||||||
|  |   double * Qt_j = & Qt_jv[0]; | ||||||
|  |   for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k)); | ||||||
|  |  | ||||||
|  |   auto basis_vp=& d_basis_v[0]; | ||||||
|   autoView(result_v,result,AcceleratorWrite); |   autoView(result_v,result,AcceleratorWrite); | ||||||
|   accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ |   accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ | ||||||
|     vobj zzz=Zero(); |     vobj zzz=Zero(); | ||||||
| @@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in | |||||||
|     } |     } | ||||||
|     coalescedWrite(result_v[ss], B); |     coalescedWrite(result_v[ss], B); | ||||||
|   }); |   }); | ||||||
|   for(int k=0;k<basis.size();k++) basis_v[k].ViewClose(); |   for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose(); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Field> | template<class Field> | ||||||
|   | |||||||
| @@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites) | |||||||
|   //  const int Nsimd = vobj::Nsimd(); |   //  const int Nsimd = vobj::Nsimd(); | ||||||
|   const int nthread = GridThread::GetThreads(); |   const int nthread = GridThread::GetThreads(); | ||||||
|  |  | ||||||
|   Vector<sobj> sumarray(nthread); |   std::vector<sobj> sumarray(nthread); | ||||||
|   for(int i=0;i<nthread;i++){ |   for(int i=0;i<nthread;i++){ | ||||||
|     sumarray[i]=Zero(); |     sumarray[i]=Zero(); | ||||||
|   } |   } | ||||||
| @@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) | |||||||
|  |  | ||||||
|   const int nthread = GridThread::GetThreads(); |   const int nthread = GridThread::GetThreads(); | ||||||
|  |  | ||||||
|   Vector<sobj> sumarray(nthread); |   std::vector<sobj> sumarray(nthread); | ||||||
|   for(int i=0;i<nthread;i++){ |   for(int i=0;i<nthread;i++){ | ||||||
|     sumarray[i]=Zero(); |     sumarray[i]=Zero(); | ||||||
|   } |   } | ||||||
| @@ -290,8 +290,10 @@ template<class vobj> | |||||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | ||||||
|   GridBase *grid = left.Grid(); |   GridBase *grid = left.Grid(); | ||||||
|  |  | ||||||
|  |   bool ok; | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|   uint64_t csum=0; |   uint64_t csum=0; | ||||||
|  |   uint64_t csum2=0; | ||||||
|   if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) |   if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) | ||||||
|   { |   { | ||||||
|     // Hack |     // Hack | ||||||
| @@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ | |||||||
|     Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); |     Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); | ||||||
|     uint64_t *base= (uint64_t *)&l_v[0]; |     uint64_t *base= (uint64_t *)&l_v[0]; | ||||||
|     csum=svm_xor(base,words); |     csum=svm_xor(base,words); | ||||||
|  |     ok = FlightRecorder::CsumLog(csum); | ||||||
|  |     if ( !ok ) { | ||||||
|  |       csum2=svm_xor(base,words); | ||||||
|  |       std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl; | ||||||
|  |     } else { | ||||||
|  |       //      csum2=svm_xor(base,words); | ||||||
|  |       //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl; | ||||||
|  |     } | ||||||
|  |     assert(ok); | ||||||
|   } |   } | ||||||
|   FlightRecorder::CsumLog(csum); |  | ||||||
| #endif | #endif | ||||||
|  |   FlightRecorder::StepLog("rank inner product"); | ||||||
|   ComplexD nrm = rankInnerProduct(left,right); |   ComplexD nrm = rankInnerProduct(left,right); | ||||||
|  |   //  ComplexD nrmck=nrm; | ||||||
|   RealD local = real(nrm); |   RealD local = real(nrm); | ||||||
|   FlightRecorder::NormLog(real(nrm));  |   ok = FlightRecorder::NormLog(real(nrm)); | ||||||
|  |   if ( !ok ) { | ||||||
|  |     ComplexD nrm2 = rankInnerProduct(left,right); | ||||||
|  |     RealD local2 = real(nrm2); | ||||||
|  |     std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl; | ||||||
|  |     assert(ok); | ||||||
|  |   } | ||||||
|  |   FlightRecorder::StepLog("Start global sum"); | ||||||
|  |   //  grid->GlobalSumP2P(nrm); | ||||||
|   grid->GlobalSum(nrm); |   grid->GlobalSum(nrm); | ||||||
|  |   FlightRecorder::StepLog("Finished global sum"); | ||||||
|  |   //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl; | ||||||
|   FlightRecorder::ReductionLog(local,real(nrm));  |   FlightRecorder::ReductionLog(local,real(nrm));  | ||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
| @@ -343,18 +365,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt | |||||||
|   autoView( x_v, x, AcceleratorRead); |   autoView( x_v, x, AcceleratorRead); | ||||||
|   autoView( y_v, y, AcceleratorRead); |   autoView( y_v, y, AcceleratorRead); | ||||||
|   autoView( z_v, z, AcceleratorWrite); |   autoView( z_v, z, AcceleratorWrite); | ||||||
| #if 0 |  | ||||||
|   typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; |  | ||||||
|   Vector<inner_t> inner_tmp(sites); |  | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |  | ||||||
|  |  | ||||||
|   accelerator_for( ss, sites, nsimd,{ |  | ||||||
|       auto tmp = a*x_v(ss)+b*y_v(ss); |  | ||||||
|       coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp)); |  | ||||||
|       coalescedWrite(z_v[ss],tmp); |  | ||||||
|   }); |  | ||||||
|   nrm = real(TensorRemove(sum(inner_tmp_v,sites))); |  | ||||||
| #else |  | ||||||
|   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; |   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; | ||||||
|   deviceVector<inner_t> inner_tmp; |   deviceVector<inner_t> inner_tmp; | ||||||
|   inner_tmp.resize(sites); |   inner_tmp.resize(sites); | ||||||
| @@ -365,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt | |||||||
|       coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); |       coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp)); | ||||||
|       coalescedWrite(z_v[ss],tmp); |       coalescedWrite(z_v[ss],tmp); | ||||||
|   }); |   }); | ||||||
|   nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); |   bool ok; | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  |   uint64_t csum=0; | ||||||
|  |   uint64_t csum2=0; | ||||||
|  |   if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) | ||||||
|  |   { | ||||||
|  |     // z_v | ||||||
|  |     { | ||||||
|  |       Integer words = sites*sizeof(vobj)/sizeof(uint64_t); | ||||||
|  |       uint64_t *base= (uint64_t *)&z_v[0]; | ||||||
|  |       csum=svm_xor(base,words); | ||||||
|  |       ok = FlightRecorder::CsumLog(csum); | ||||||
|  |       if ( !ok ) { | ||||||
|  | 	csum2=svm_xor(base,words); | ||||||
|  | 	std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl; | ||||||
|  |       } | ||||||
|  |       assert(ok); | ||||||
|  |     } | ||||||
|  |     // inner_v | ||||||
|  |     { | ||||||
|  |       Integer words = sites*sizeof(inner_t)/sizeof(uint64_t); | ||||||
|  |       uint64_t *base= (uint64_t *)&inner_tmp_v[0]; | ||||||
|  |       csum=svm_xor(base,words); | ||||||
|  |       ok = FlightRecorder::CsumLog(csum); | ||||||
|  |       if ( !ok ) { | ||||||
|  | 	csum2=svm_xor(base,words); | ||||||
|  | 	std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl; | ||||||
|  |       } | ||||||
|  |       assert(ok); | ||||||
|  |     } | ||||||
|  |   } | ||||||
| #endif | #endif | ||||||
|  |   nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); | ||||||
|  |   ok = FlightRecorder::NormLog(real(nrm)); | ||||||
|  |   assert(ok); | ||||||
|  |   RealD local = real(nrm); | ||||||
|   grid->GlobalSum(nrm); |   grid->GlobalSum(nrm); | ||||||
|  |   FlightRecorder::ReductionLog(local,real(nrm)); | ||||||
|   return nrm;  |   return nrm;  | ||||||
| } | } | ||||||
|   |   | ||||||
| @@ -377,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti | |||||||
|   conformable(left,right); |   conformable(left,right); | ||||||
|  |  | ||||||
|   typedef typename vobj::vector_typeD vector_type; |   typedef typename vobj::vector_typeD vector_type; | ||||||
|   Vector<ComplexD> tmp(2); |   std::vector<ComplexD> tmp(2); | ||||||
|  |  | ||||||
|   GridBase *grid = left.Grid(); |   GridBase *grid = left.Grid(); | ||||||
|  |  | ||||||
| @@ -387,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti | |||||||
|   // GPU |   // GPU | ||||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; |   typedef decltype(innerProductD(vobj(),vobj())) inner_t; | ||||||
|   typedef decltype(innerProductD(vobj(),vobj())) norm_t; |   typedef decltype(innerProductD(vobj(),vobj())) norm_t; | ||||||
|   Vector<inner_t> inner_tmp(sites); |   deviceVector<inner_t> inner_tmp(sites); | ||||||
|   Vector<norm_t>  norm_tmp(sites); |   deviceVector<norm_t>  norm_tmp(sites); | ||||||
|   auto inner_tmp_v = &inner_tmp[0]; |   auto inner_tmp_v = &inner_tmp[0]; | ||||||
|   auto norm_tmp_v = &norm_tmp[0]; |   auto norm_tmp_v = &norm_tmp[0]; | ||||||
|   { |   { | ||||||
| @@ -438,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) | |||||||
| // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... | // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... | ||||||
| ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim) | template<class vobj> inline void sliceSum(const Lattice<vobj> &Data, | ||||||
|  | 					  std::vector<typename vobj::scalar_object> &result, | ||||||
|  | 					  int orthogdim) | ||||||
| { | { | ||||||
|   /////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////// | ||||||
|   // FIXME precision promoted summation |   // FIXME precision promoted summation | ||||||
| @@ -460,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | |||||||
|   int ld=grid->_ldimensions[orthogdim]; |   int ld=grid->_ldimensions[orthogdim]; | ||||||
|   int rd=grid->_rdimensions[orthogdim]; |   int rd=grid->_rdimensions[orthogdim]; | ||||||
|  |  | ||||||
|   Vector<vobj> lvSum(rd); // will locally sum vectors first |   std::vector<vobj> lvSum(rd); // will locally sum vectors first | ||||||
|   Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars |   std::vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars | ||||||
|   ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD |   ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD | ||||||
|  |  | ||||||
|   result.resize(fd); // And then global sum to return the same vector to every node  |   result.resize(fd); // And then global sum to return the same vector to every node  | ||||||
| @@ -509,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | |||||||
|   scalar_type * ptr = (scalar_type *) &result[0]; |   scalar_type * ptr = (scalar_type *) &result[0]; | ||||||
|   int words = fd*sizeof(sobj)/sizeof(scalar_type); |   int words = fd*sizeof(sobj)/sizeof(scalar_type); | ||||||
|   grid->GlobalSumVector(ptr, words); |   grid->GlobalSumVector(ptr, words); | ||||||
|  |   //  std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl; | ||||||
|  |    | ||||||
| } | } | ||||||
| template<class vobj> inline | template<class vobj> inline | ||||||
| std::vector<typename vobj::scalar_object>  | std::vector<typename vobj::scalar_object>  | ||||||
| @@ -552,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti | |||||||
|   int ld=grid->_ldimensions[orthogdim]; |   int ld=grid->_ldimensions[orthogdim]; | ||||||
|   int rd=grid->_rdimensions[orthogdim]; |   int rd=grid->_rdimensions[orthogdim]; | ||||||
|  |  | ||||||
|   Vector<vector_type> lvSum(rd); // will locally sum vectors first |   std::vector<vector_type> lvSum(rd); // will locally sum vectors first | ||||||
|   Vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars |   std::vector<scalar_type > lsSum(ld,scalar_type(0.0));                    // sum across these down to scalars | ||||||
|   ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD   |   ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd);   // splitting the SIMD   | ||||||
|  |  | ||||||
|   result.resize(fd); // And then global sum to return the same vector to every node for IO to file |   result.resize(fd); // And then global sum to return the same vector to every node for IO to file | ||||||
|   | |||||||
| @@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi | |||||||
|   // Move out of UVM |   // Move out of UVM | ||||||
|   // Turns out I had messed up the synchronise after move to compute stream |   // Turns out I had messed up the synchronise after move to compute stream | ||||||
|   // as running this on the default stream fools the synchronise |   // as running this on the default stream fools the synchronise | ||||||
| #undef UVM_BLOCK_BUFFER   |   deviceVector<sobj> buffer(numBlocks); | ||||||
| #ifndef UVM_BLOCK_BUFFER   |  | ||||||
|   commVector<sobj> buffer(numBlocks); |  | ||||||
|   sobj *buffer_v = &buffer[0]; |   sobj *buffer_v = &buffer[0]; | ||||||
|   sobj result; |   sobj result; | ||||||
|   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); |   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); | ||||||
|   accelerator_barrier(); |   accelerator_barrier(); | ||||||
|   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); |   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); | ||||||
| #else |  | ||||||
|   Vector<sobj> buffer(numBlocks); |  | ||||||
|   sobj *buffer_v = &buffer[0]; |  | ||||||
|   sobj result; |  | ||||||
|   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); |  | ||||||
|   accelerator_barrier(); |  | ||||||
|   result = *buffer_v; |  | ||||||
| #endif |  | ||||||
|   return result; |   return result; | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi | |||||||
|    |    | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |   const int words = sizeof(vobj)/sizeof(vector); | ||||||
|  |  | ||||||
|   Vector<vector> buffer(osites); |   deviceVector<vector> buffer(osites); | ||||||
|   vector *dat = (vector *)lat; |   vector *dat = (vector *)lat; | ||||||
|   vector *buf = &buffer[0]; |   vector *buf = &buffer[0]; | ||||||
|   iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0]; |   iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0]; | ||||||
|   | |||||||
| @@ -4,33 +4,28 @@ NAMESPACE_BEGIN(Grid); | |||||||
| // Possibly promote to double and sum | // Possibly promote to double and sum | ||||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  |  | ||||||
| template <class vobj> | template <class vobj> | ||||||
| inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)  | inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)  | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_object sobj; |   typedef typename vobj::scalar_object sobj; | ||||||
|   typedef typename vobj::scalar_objectD sobjD; |   typedef typename vobj::scalar_objectD sobjD; | ||||||
|   static Vector<sobj> mysum; |  | ||||||
|   mysum.resize(1); |  | ||||||
|   sobj *mysum_p = & mysum[0]; |  | ||||||
|   sobj identity; zeroit(identity); |   sobj identity; zeroit(identity); | ||||||
|   mysum[0] = identity; |   sobj ret; zeroit(ret); | ||||||
|   sobj ret ;  |  | ||||||
|  |  | ||||||
|   Integer nsimd= vobj::Nsimd(); |   Integer nsimd= vobj::Nsimd(); | ||||||
|  |   {  | ||||||
|   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); |     sycl::buffer<sobj, 1> abuff(&ret, {1}); | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |     theGridAccelerator->submit([&](sycl::handler &cgh) { | ||||||
|     auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); |       auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>()); | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{osites}, |       cgh.parallel_for(sycl::range<1>{osites}, | ||||||
| 		      Reduction, |                       Reduction, | ||||||
| 		      [=] (cl::sycl::id<1> item, auto &sum) { |                       [=] (sycl::id<1> item, auto &sum) { | ||||||
|       auto osite   = item[0]; |                         auto osite   = item[0]; | ||||||
|       sum +=Reduce(lat[osite]); |                         sum +=Reduce(lat[osite]); | ||||||
|      }); |                       }); | ||||||
|    }); |     }); | ||||||
|   theGridAccelerator->wait(); |   } | ||||||
|   ret = mysum[0]; |  | ||||||
|   //  free(mysum,*theGridAccelerator); |  | ||||||
|   sobjD dret; convertType(dret,ret); |   sobjD dret; convertType(dret,ret); | ||||||
|   return dret; |   return dret; | ||||||
| } | } | ||||||
| @@ -76,59 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite | |||||||
|  |  | ||||||
| template<class Word> Word svm_xor(Word *vec,uint64_t L) | template<class Word> Word svm_xor(Word *vec,uint64_t L) | ||||||
| { | { | ||||||
|   Word xorResult; xorResult = 0; |  | ||||||
|   static Vector<Word> d_sum; |  | ||||||
|   d_sum.resize(1); |  | ||||||
|   Word *d_sum_p=&d_sum[0]; |  | ||||||
|   Word identity;  identity=0; |   Word identity;  identity=0; | ||||||
|   d_sum[0] = identity; |   Word ret = 0; | ||||||
|   const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); |   {  | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |     sycl::buffer<Word, 1> abuff(&ret, {1}); | ||||||
|     auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); |     theGridAccelerator->submit([&](sycl::handler &cgh) { | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, |       auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>()); | ||||||
| 		      Reduction, |       cgh.parallel_for(sycl::range<1>{L}, | ||||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { |                       Reduction, | ||||||
| 	 sum^=vec[index]; |                       [=] (sycl::id<1> index, auto &sum) { | ||||||
|      }); |                         sum ^=vec[index]; | ||||||
|    }); |                       }); | ||||||
|  |     }); | ||||||
|  |   } | ||||||
|   theGridAccelerator->wait(); |   theGridAccelerator->wait(); | ||||||
|   Word ret = d_sum[0]; |  | ||||||
|   //  free(d_sum,*theGridAccelerator); |  | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| /* |  | ||||||
|  |  | ||||||
| template <class vobj> |  | ||||||
| inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) |  | ||||||
| { |  | ||||||
|   typedef typename vobj::vector_type  vector; |  | ||||||
|   typedef typename vobj::scalar_type  scalar; |  | ||||||
|  |  | ||||||
|   typedef typename vobj::scalar_typeD scalarD; |  | ||||||
|   typedef typename vobj::scalar_objectD sobjD; |  | ||||||
|  |  | ||||||
|   sobjD ret; |  | ||||||
|   scalarD *ret_p = (scalarD *)&ret; |  | ||||||
|    |  | ||||||
|   const int nsimd = vobj::Nsimd(); |  | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |  | ||||||
|  |  | ||||||
|   Vector<scalar> buffer(osites*nsimd); |  | ||||||
|   scalar *buf = &buffer[0]; |  | ||||||
|   vector *dat = (vector *)lat; |  | ||||||
|  |  | ||||||
|   for(int w=0;w<words;w++) { |  | ||||||
|  |  | ||||||
|     accelerator_for(ss,osites,nsimd,{ |  | ||||||
| 	int lane = acceleratorSIMTlane(nsimd); |  | ||||||
| 	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane); |  | ||||||
|     }); |  | ||||||
|     //Precision change at this point is to late to gain precision |  | ||||||
|     ret_p[w] = svm_reduce(buf,nsimd*osites); |  | ||||||
|   } |  | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
| */ |  | ||||||
|   | |||||||
| @@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid); | |||||||
|  |  | ||||||
|  |  | ||||||
| #if defined(GRID_CUDA) || defined(GRID_HIP) | #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
| template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | template<class vobj> | ||||||
|  | inline void sliceSumReduction_cub_small(const vobj *Data, | ||||||
|  | 					std::vector<vobj> &lvSum, | ||||||
|  | 					const int rd, | ||||||
|  | 					const int e1, | ||||||
|  | 					const int e2, | ||||||
|  | 					const int stride, | ||||||
|  | 					const int ostride, | ||||||
|  | 					const int Nsimd) | ||||||
|  | { | ||||||
|   size_t subvol_size = e1*e2; |   size_t subvol_size = e1*e2; | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size); |   deviceVector<vobj> reduction_buffer(rd*subvol_size); | ||||||
|   auto rb_p = &reduction_buffer[0]; |   auto rb_p = &reduction_buffer[0]; | ||||||
|   vobj zero_init; |   vobj zero_init; | ||||||
|   zeroit(zero_init); |   zeroit(zero_init); | ||||||
| @@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V | |||||||
|  |  | ||||||
|  |  | ||||||
| #if defined(GRID_SYCL) | #if defined(GRID_SYCL) | ||||||
| template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | template<class vobj> | ||||||
|  | inline void sliceSumReduction_sycl_small(const vobj *Data, | ||||||
|  | 					 std::vector <vobj> &lvSum, | ||||||
|  | 					 const int  &rd, | ||||||
|  | 					 const int &e1, | ||||||
|  | 					 const int &e2, | ||||||
|  | 					 const int &stride, | ||||||
|  | 					 const int &ostride, | ||||||
|  | 					 const int &Nsimd) | ||||||
| { | { | ||||||
|   size_t subvol_size = e1*e2; |   size_t subvol_size = e1*e2; | ||||||
|  |  | ||||||
| @@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, | |||||||
|     mysum[r] = vobj_zero;  |     mysum[r] = vobj_zero;  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     |   deviceVector<vobj> reduction_buffer(rd*subvol_size);     | ||||||
|  |  | ||||||
|   auto rb_p = &reduction_buffer[0]; |   auto rb_p = &reduction_buffer[0]; | ||||||
|  |  | ||||||
| @@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, | |||||||
|   }); |   }); | ||||||
|  |  | ||||||
|   for (int r = 0; r < rd; r++) { |   for (int r = 0; r < rd; r++) { | ||||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |       theGridAccelerator->submit([&](sycl::handler &cgh) { | ||||||
|           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); |           auto Reduction = sycl::reduction(&mysum[r],std::plus<>()); | ||||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, |           cgh.parallel_for(sycl::range<1>{subvol_size}, | ||||||
|           Reduction, |           Reduction, | ||||||
|           [=](cl::sycl::id<1> item, auto &sum) { |           [=](sycl::id<1> item, auto &sum) { | ||||||
|               auto s = item[0]; |               auto s = item[0]; | ||||||
|               sum += rb_p[r*subvol_size+s]; |               sum += rb_p[r*subvol_size+s]; | ||||||
|           }); |           }); | ||||||
| @@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, | |||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { | template<class vobj> | ||||||
|  | inline void sliceSumReduction_large(const vobj *Data, | ||||||
|  | 				    std::vector<vobj> &lvSum, | ||||||
|  | 				    const int rd, | ||||||
|  | 				    const int e1, | ||||||
|  | 				    const int e2, | ||||||
|  | 				    const int stride, | ||||||
|  | 				    const int ostride, | ||||||
|  | 				    const int Nsimd) | ||||||
|  | { | ||||||
|   typedef typename vobj::vector_type vector; |   typedef typename vobj::vector_type vector; | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |   const int words = sizeof(vobj)/sizeof(vector); | ||||||
|   const int osites = rd*e1*e2; |   const int osites = rd*e1*e2; | ||||||
|   commVector<vector>buffer(osites); |   deviceVector<vector>buffer(osites); | ||||||
|   vector *dat = (vector *)Data; |   vector *dat = (vector *)Data; | ||||||
|   vector *buf = &buffer[0]; |   vector *buf = &buffer[0]; | ||||||
|   Vector<vector> lvSum_small(rd); |   std::vector<vector> lvSum_small(rd); | ||||||
|   vector *lvSum_ptr = (vector *)&lvSum[0]; |   vector *lvSum_ptr = (vector *)&lvSum[0]; | ||||||
|  |  | ||||||
|   for (int w = 0; w < words; w++) { |   for (int w = 0; w < words; w++) { | ||||||
| @@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto | |||||||
|     for (int r = 0; r < rd; r++) { |     for (int r = 0; r < rd; r++) { | ||||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; |       lvSum_ptr[w+words*r]=lvSum_small[r]; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|    |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) | template<class vobj> | ||||||
|  | inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, | ||||||
|  | 				  std::vector<vobj> &lvSum, | ||||||
|  | 				  const int rd, | ||||||
|  | 				  const int e1, | ||||||
|  | 				  const int e2, | ||||||
|  | 				  const int stride, | ||||||
|  | 				  const int ostride, | ||||||
|  | 				  const int Nsimd) | ||||||
| { | { | ||||||
|   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. |   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. | ||||||
|     if constexpr (sizeof(vobj) <= 256) {  |     if constexpr (sizeof(vobj) <= 256) {  | ||||||
| @@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data | |||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) | template<class vobj> | ||||||
|  | inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, | ||||||
|  | 				  std::vector<vobj> &lvSum, | ||||||
|  | 				  const int &rd, | ||||||
|  | 				  const int &e1, | ||||||
|  | 				  const int &e2, | ||||||
|  | 				  const int &stride, | ||||||
|  | 				  const int &ostride, | ||||||
|  | 				  const int &Nsimd) | ||||||
| { | { | ||||||
|   // sum over reduced dimension planes, breaking out orthog dir |   // sum over reduced dimension planes, breaking out orthog dir | ||||||
|   // Parallel over orthog direction |   // Parallel over orthog direction | ||||||
| @@ -208,16 +247,20 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data | |||||||
|   }); |   }); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  | template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, | ||||||
|  | 						   std::vector<vobj> &lvSum, | ||||||
|  | 						   const int &rd, | ||||||
|  | 						   const int &e1, | ||||||
|  | 						   const int &e2, | ||||||
|  | 						   const int &stride, | ||||||
|  | 						   const int &ostride, | ||||||
|  | 						   const int &Nsimd)  | ||||||
| { | { | ||||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) | #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) | ||||||
|    |  | ||||||
|   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|    | #else | ||||||
|   #else |  | ||||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); | ||||||
|  | #endif | ||||||
|   #endif |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve | |||||||
|  * |  * | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf, | template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf, | ||||||
| 					      Lattice<vobj> &lat, | 					      Lattice<vobj> &lat, | ||||||
| 					      int x, | 					      int x, | ||||||
| 					      int dim, | 					      int dim, | ||||||
| @@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf, | |||||||
|   }); |   }); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf, | template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf, | ||||||
| 					     const Lattice<vobj> &lat, | 					     const Lattice<vobj> &lat, | ||||||
| 					     int x, | 					     int x, | ||||||
| 					     int dim, | 					     int dim, | ||||||
| @@ -462,13 +462,19 @@ public: | |||||||
|     int rNsimd = Nsimd / simd[dimension]; |     int rNsimd = Nsimd / simd[dimension]; | ||||||
|     assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); |     assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); | ||||||
|  |  | ||||||
|     static cshiftVector<vobj> send_buf;  |     static deviceVector<vobj> send_buf;  | ||||||
|     static cshiftVector<vobj> recv_buf; |     static deviceVector<vobj> recv_buf; | ||||||
|     send_buf.resize(buffer_size*2*depth);     |     send_buf.resize(buffer_size*2*depth);     | ||||||
|     recv_buf.resize(buffer_size*2*depth); |     recv_buf.resize(buffer_size*2*depth); | ||||||
|  | #ifndef ACCELERATOR_AWARE_MPI | ||||||
|  |     static hostVector<vobj> hsend_buf;  | ||||||
|  |     static hostVector<vobj> hrecv_buf; | ||||||
|  |     hsend_buf.resize(buffer_size*2*depth);     | ||||||
|  |     hrecv_buf.resize(buffer_size*2*depth); | ||||||
|  | #endif     | ||||||
|  |  | ||||||
|     std::vector<CommsRequest_t> fwd_req;    |     std::vector<MpiCommsRequest_t> fwd_req;    | ||||||
|     std::vector<CommsRequest_t> bwd_req;    |     std::vector<MpiCommsRequest_t> bwd_req;    | ||||||
|  |  | ||||||
|     int words = buffer_size; |     int words = buffer_size; | ||||||
|     int bytes = words * sizeof(vobj); |     int bytes = words * sizeof(vobj); | ||||||
| @@ -495,9 +501,17 @@ public: | |||||||
|       t_gather+=usecond()-t; |       t_gather+=usecond()-t; | ||||||
|  |  | ||||||
|       t=usecond(); |       t=usecond(); | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
|       grid->SendToRecvFromBegin(fwd_req, |       grid->SendToRecvFromBegin(fwd_req, | ||||||
| 				(void *)&send_buf[d*buffer_size], xmit_to_rank, | 				(void *)&send_buf[d*buffer_size], xmit_to_rank, | ||||||
| 				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); | 				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag); | ||||||
|  | #else | ||||||
|  |       acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes); | ||||||
|  |       grid->SendToRecvFromBegin(fwd_req, | ||||||
|  | 				(void *)&hsend_buf[d*buffer_size], xmit_to_rank, | ||||||
|  | 				(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag); | ||||||
|  |       acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes); | ||||||
|  | #endif | ||||||
|       t_comms+=usecond()-t; |       t_comms+=usecond()-t; | ||||||
|      } |      } | ||||||
|     for ( int d=0;d < depth ; d ++ ) { |     for ( int d=0;d < depth ; d ++ ) { | ||||||
| @@ -508,9 +522,17 @@ public: | |||||||
|       t_gather+= usecond() - t; |       t_gather+= usecond() - t; | ||||||
|  |  | ||||||
|       t=usecond(); |       t=usecond(); | ||||||
|  | #ifdef ACCELERATOR_AWARE_MPI | ||||||
|       grid->SendToRecvFromBegin(bwd_req, |       grid->SendToRecvFromBegin(bwd_req, | ||||||
| 				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, | 				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank, | ||||||
| 				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); | 				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); | ||||||
|  | #else | ||||||
|  |       acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes); | ||||||
|  |       grid->SendToRecvFromBegin(bwd_req, | ||||||
|  | 				(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank, | ||||||
|  | 				(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag); | ||||||
|  |       acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes); | ||||||
|  | #endif       | ||||||
|       t_comms+=usecond()-t; |       t_comms+=usecond()-t; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -98,7 +98,7 @@ public: | |||||||
|   virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action |   virtual RealD S(const GaugeField& U) = 0;                             // evaluate the action | ||||||
|   virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? |   virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ;  // if the refresh computes the action, can cache it. Alternately refreshAndAction() ? | ||||||
|   virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative |   virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;        // evaluate the action derivative | ||||||
|  |   | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   // virtual smeared interface through configuration container |   // virtual smeared interface through configuration container | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
| @@ -132,6 +132,10 @@ public: | |||||||
| template <class GaugeField > | template <class GaugeField > | ||||||
| class EmptyAction : public Action <GaugeField> | class EmptyAction : public Action <GaugeField> | ||||||
| { | { | ||||||
|  |   using Action<GaugeField>::refresh; | ||||||
|  |   using Action<GaugeField>::Sinitial; | ||||||
|  |   using Action<GaugeField>::deriv; | ||||||
|  |  | ||||||
|   virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions |   virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions | ||||||
|   virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action |   virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action | ||||||
|   virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative |   virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative | ||||||
|   | |||||||
| @@ -55,6 +55,11 @@ public: | |||||||
|   RealD alpha; // Mobius scale |   RealD alpha; // Mobius scale | ||||||
|   RealD k;     // EOFA normalization constant |   RealD k;     // EOFA normalization constant | ||||||
|  |  | ||||||
|  |   // Device resident | ||||||
|  |   deviceVector<Coeff_t> d_shift_coefficients; | ||||||
|  |   deviceVector<Coeff_t> d_MooeeInv_shift_lc; | ||||||
|  |   deviceVector<Coeff_t> d_MooeeInv_shift_norm; | ||||||
|  |    | ||||||
|   virtual void Instantiatable(void) = 0; |   virtual void Instantiatable(void) = 0; | ||||||
|  |  | ||||||
|   // EOFA-specific operations |   // EOFA-specific operations | ||||||
| @@ -92,6 +97,11 @@ public: | |||||||
|     this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) / |     this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) / | ||||||
|       ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) / |       ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) / | ||||||
|       ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) ); |       ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) ); | ||||||
|  |      | ||||||
|  |     d_shift_coefficients.resize(Ls); | ||||||
|  |     d_MooeeInv_shift_lc.resize(Ls); | ||||||
|  |     d_MooeeInv_shift_norm.resize(Ls); | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -90,16 +90,16 @@ public: | |||||||
|   void M5D(const FermionField &psi, |   void M5D(const FermionField &psi, | ||||||
| 	   const FermionField &phi, | 	   const FermionField &phi, | ||||||
| 	   FermionField &chi, | 	   FermionField &chi, | ||||||
| 	   Vector<Coeff_t> &lower, | 	   std::vector<Coeff_t> &lower, | ||||||
| 	   Vector<Coeff_t> &diag, | 	   std::vector<Coeff_t> &diag, | ||||||
| 	   Vector<Coeff_t> &upper); | 	   std::vector<Coeff_t> &upper); | ||||||
|  |  | ||||||
|   void M5Ddag(const FermionField &psi, |   void M5Ddag(const FermionField &psi, | ||||||
| 	      const FermionField &phi, | 	      const FermionField &phi, | ||||||
| 	      FermionField &chi, | 	      FermionField &chi, | ||||||
| 	      Vector<Coeff_t> &lower, | 	      std::vector<Coeff_t> &lower, | ||||||
| 	      Vector<Coeff_t> &diag, | 	      std::vector<Coeff_t> &diag, | ||||||
| 	      Vector<Coeff_t> &upper); | 	      std::vector<Coeff_t> &upper); | ||||||
|  |  | ||||||
|   virtual void   Instantiatable(void)=0; |   virtual void   Instantiatable(void)=0; | ||||||
|  |  | ||||||
| @@ -119,35 +119,51 @@ public: | |||||||
|   RealD mass_plus, mass_minus; |   RealD mass_plus, mass_minus; | ||||||
|  |  | ||||||
|   // Save arguments to SetCoefficientsInternal |   // Save arguments to SetCoefficientsInternal | ||||||
|   Vector<Coeff_t> _gamma; |   std::vector<Coeff_t> _gamma; | ||||||
|   RealD                _zolo_hi; |   RealD                _zolo_hi; | ||||||
|   RealD                _b; |   RealD                _b; | ||||||
|   RealD                _c; |   RealD                _c; | ||||||
|  |  | ||||||
|  |   // possible boost | ||||||
|  |   std::vector<ComplexD> qmu; | ||||||
|  |   void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);}; | ||||||
|  |   void addQmu(const FermionField &in, FermionField &out, int dag); | ||||||
|  |    | ||||||
|   // Cayley form Moebius (tanh and zolotarev) |   // Cayley form Moebius (tanh and zolotarev) | ||||||
|   Vector<Coeff_t> omega; |   std::vector<Coeff_t> omega; | ||||||
|   Vector<Coeff_t> bs;    // S dependent coeffs |   std::vector<Coeff_t> bs;    // S dependent coeffs | ||||||
|   Vector<Coeff_t> cs; |   std::vector<Coeff_t> cs; | ||||||
|   Vector<Coeff_t> as; |   std::vector<Coeff_t> as; | ||||||
|   // For preconditioning Cayley form |   // For preconditioning Cayley form | ||||||
|   Vector<Coeff_t> bee; |   std::vector<Coeff_t> bee; | ||||||
|   Vector<Coeff_t> cee; |   std::vector<Coeff_t> cee; | ||||||
|   Vector<Coeff_t> aee; |   std::vector<Coeff_t> aee; | ||||||
|   Vector<Coeff_t> beo; |   std::vector<Coeff_t> beo; | ||||||
|   Vector<Coeff_t> ceo; |   std::vector<Coeff_t> ceo; | ||||||
|   Vector<Coeff_t> aeo; |   std::vector<Coeff_t> aeo; | ||||||
|   // LDU factorisation of the eeoo matrix |   // LDU factorisation of the eeoo matrix | ||||||
|   Vector<Coeff_t> lee; |   std::vector<Coeff_t> lee; | ||||||
|   Vector<Coeff_t> leem; |   std::vector<Coeff_t> leem; | ||||||
|   Vector<Coeff_t> uee; |   std::vector<Coeff_t> uee; | ||||||
|   Vector<Coeff_t> ueem; |   std::vector<Coeff_t> ueem; | ||||||
|   Vector<Coeff_t> dee; |   std::vector<Coeff_t> dee; | ||||||
|  |  | ||||||
|  |   // Device memory | ||||||
|  |   deviceVector<Coeff_t> d_diag; | ||||||
|  |   deviceVector<Coeff_t> d_upper; | ||||||
|  |   deviceVector<Coeff_t> d_lower; | ||||||
|  |  | ||||||
|  |   deviceVector<Coeff_t> d_lee; | ||||||
|  |   deviceVector<Coeff_t> d_dee; | ||||||
|  |   deviceVector<Coeff_t> d_uee; | ||||||
|  |   deviceVector<Coeff_t> d_leem; | ||||||
|  |   deviceVector<Coeff_t> d_ueem; | ||||||
|  |  | ||||||
|   // Matrices of 5d ee inverse params |   // Matrices of 5d ee inverse params | ||||||
|   Vector<iSinglet<Simd> >  MatpInv; |   //  std::vector<iSinglet<Simd> >  MatpInv; | ||||||
|   Vector<iSinglet<Simd> >  MatmInv; |   //  std::vector<iSinglet<Simd> >  MatmInv; | ||||||
|   Vector<iSinglet<Simd> >  MatpInvDag; |   //  std::vector<iSinglet<Simd> >  MatpInvDag; | ||||||
|   Vector<iSinglet<Simd> >  MatmInvDag; |   //  std::vector<iSinglet<Simd> >  MatmInvDag; | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////// | ||||||
|   // Conserved current utilities |   // Conserved current utilities | ||||||
| @@ -187,7 +203,7 @@ public: | |||||||
| protected: | protected: | ||||||
|   virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); |   virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); | ||||||
|   virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); |   virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); | ||||||
|   virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c); |   virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c); | ||||||
| }; | }; | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -60,6 +60,50 @@ public: | |||||||
|   //      virtual void   Instantiatable(void)=0; |   //      virtual void   Instantiatable(void)=0; | ||||||
|   virtual void   Instantiatable(void) =0; |   virtual void   Instantiatable(void) =0; | ||||||
|  |  | ||||||
|  |   void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist) | ||||||
|  |   { | ||||||
|  |     std::cout << "Free Propagator for PartialFraction"<<std::endl; | ||||||
|  |     FermionField in_k(in.Grid()); | ||||||
|  |     FermionField prop_k(in.Grid()); | ||||||
|  |      | ||||||
|  |     FFT theFFT((GridCartesian *) in.Grid()); | ||||||
|  |  | ||||||
|  |     //phase for boundary condition | ||||||
|  |     ComplexField coor(in.Grid()); | ||||||
|  |     ComplexField ph(in.Grid());  ph = Zero(); | ||||||
|  |     FermionField in_buf(in.Grid()); in_buf = Zero(); | ||||||
|  |     typedef typename Simd::scalar_type Scalar; | ||||||
|  |     Scalar ci(0.0,1.0); | ||||||
|  |     assert(twist.size() == Nd);//check that twist is Nd | ||||||
|  |     assert(boundary.size() == Nd);//check that boundary conditions is Nd | ||||||
|  |     int shift = 0; | ||||||
|  |     for(unsigned int nu = 0; nu < Nd; nu++) | ||||||
|  |       { | ||||||
|  | 	// Shift coordinate lattice index by 1 to account for 5th dimension. | ||||||
|  | 	LatticeCoordinate(coor, nu + shift); | ||||||
|  | 	double boundary_phase = ::acos(real(boundary[nu])); | ||||||
|  | 	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift]))); | ||||||
|  | 	//momenta for propagator shifted by twist+boundary | ||||||
|  | 	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI)); | ||||||
|  |       } | ||||||
|  |     in_buf = exp(ci*ph*(-1.0))*in; | ||||||
|  |  | ||||||
|  |     theFFT.FFT_all_dim(in_k,in,FFT::forward); | ||||||
|  |     this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist); | ||||||
|  |     theFFT.FFT_all_dim(out,prop_k,FFT::backward); | ||||||
|  |      | ||||||
|  |     //phase for boundary condition | ||||||
|  |     out = out * exp(ci*ph); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { | ||||||
|  |     std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions | ||||||
|  |     std::vector<Complex> boundary; | ||||||
|  |     for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions | ||||||
|  |     FreePropagator(in,out,mass,boundary,twist); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |    | ||||||
|   // Efficient support for multigrid coarsening |   // Efficient support for multigrid coarsening | ||||||
|   virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp); |   virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp); | ||||||
|   virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out); |   virtual void  MdirAll(const FermionField &in, std::vector<FermionField> &out); | ||||||
| @@ -90,12 +134,12 @@ protected: | |||||||
|   RealD mass; |   RealD mass; | ||||||
|   RealD R; |   RealD R; | ||||||
|   RealD ZoloHiInv; |   RealD ZoloHiInv; | ||||||
|   Vector<double> Beta; |   std::vector<double> Beta; | ||||||
|   Vector<double> cc;; |   std::vector<double> cc;; | ||||||
|   Vector<double> cc_d;; |   std::vector<double> cc_d;; | ||||||
|   Vector<double> sqrt_cc; |   std::vector<double> sqrt_cc; | ||||||
|   Vector<double> See; |   std::vector<double> See; | ||||||
|   Vector<double> Aee; |   std::vector<double> Aee; | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -69,10 +69,10 @@ public: | |||||||
|   // Instantiate different versions depending on Impl |   // Instantiate different versions depending on Impl | ||||||
|   ///////////////////////////////////////////////////// |   ///////////////////////////////////////////////////// | ||||||
|   void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, |   void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, | ||||||
| 	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper); | 	   std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); | ||||||
|  |  | ||||||
|   void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, |   void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, | ||||||
| 	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper); | 	      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); | ||||||
|  |  | ||||||
|   virtual void RefreshShiftCoefficients(RealD new_shift); |   virtual void RefreshShiftCoefficients(RealD new_shift); | ||||||
|  |  | ||||||
| @@ -83,7 +83,7 @@ public: | |||||||
| 			RealD _M5, const ImplParams& p=ImplParams()); | 			RealD _M5, const ImplParams& p=ImplParams()); | ||||||
|  |  | ||||||
| protected: | protected: | ||||||
|   void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c); |   void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c); | ||||||
| }; | }; | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -102,11 +102,11 @@ public: | |||||||
| 		     GaugeField &mat,  | 		     GaugeField &mat,  | ||||||
| 		     const FermionField &A, const FermionField &B, int dag); | 		     const FermionField &A, const FermionField &B, int dag); | ||||||
|  |  | ||||||
|   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, |   void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, | ||||||
|                     const FermionField &in, FermionField &out, int dag); |                     const FermionField &in, FermionField &out, int dag); | ||||||
|   void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, |   void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, | ||||||
|                     const FermionField &in, FermionField &out, int dag); |                     const FermionField &in, FermionField &out, int dag); | ||||||
|   void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, |   void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, | ||||||
|                     const FermionField &in, FermionField &out, int dag); |                     const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -164,8 +164,6 @@ public: | |||||||
|   DoubledGaugeField UUUmuEven; |   DoubledGaugeField UUUmuEven; | ||||||
|   DoubledGaugeField UUUmuOdd; |   DoubledGaugeField UUUmuOdd; | ||||||
|  |  | ||||||
|   LebesgueOrder Lebesgue; |  | ||||||
|   LebesgueOrder LebesgueEvenOdd; |  | ||||||
|    |    | ||||||
|   /////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////// | ||||||
|   // Conserved current utilities |   // Conserved current utilities | ||||||
|   | |||||||
| @@ -100,7 +100,6 @@ public: | |||||||
| 		     int dag); | 		     int dag); | ||||||
|      |      | ||||||
|   void DhopInternal(StencilImpl & st, |   void DhopInternal(StencilImpl & st, | ||||||
| 		    LebesgueOrder &lo, |  | ||||||
| 		    DoubledGaugeField &U, | 		    DoubledGaugeField &U, | ||||||
| 		    DoubledGaugeField &UUU, | 		    DoubledGaugeField &UUU, | ||||||
| 		    const FermionField &in,  | 		    const FermionField &in,  | ||||||
| @@ -108,7 +107,6 @@ public: | |||||||
| 		    int dag); | 		    int dag); | ||||||
|      |      | ||||||
|     void DhopInternalOverlappedComms(StencilImpl & st, |     void DhopInternalOverlappedComms(StencilImpl & st, | ||||||
| 		      LebesgueOrder &lo, |  | ||||||
| 		      DoubledGaugeField &U, | 		      DoubledGaugeField &U, | ||||||
| 		      DoubledGaugeField &UUU, | 		      DoubledGaugeField &UUU, | ||||||
| 		      const FermionField &in,  | 		      const FermionField &in,  | ||||||
| @@ -116,7 +114,6 @@ public: | |||||||
| 		      int dag); | 		      int dag); | ||||||
|  |  | ||||||
|     void DhopInternalSerialComms(StencilImpl & st, |     void DhopInternalSerialComms(StencilImpl & st, | ||||||
| 		      LebesgueOrder &lo, |  | ||||||
| 		      DoubledGaugeField &U, | 		      DoubledGaugeField &U, | ||||||
| 		      DoubledGaugeField &UUU, | 		      DoubledGaugeField &UUU, | ||||||
| 		      const FermionField &in,  | 		      const FermionField &in,  | ||||||
| @@ -192,8 +189,6 @@ public: | |||||||
|   DoubledGaugeField UUUmuEven; |   DoubledGaugeField UUUmuEven; | ||||||
|   DoubledGaugeField UUUmuOdd; |   DoubledGaugeField UUUmuOdd; | ||||||
|      |      | ||||||
|   LebesgueOrder Lebesgue; |  | ||||||
|   LebesgueOrder LebesgueEvenOdd; |  | ||||||
|      |      | ||||||
|   // Comms buffer |   // Comms buffer | ||||||
|   //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; |   //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; | ||||||
|   | |||||||
| @@ -42,11 +42,11 @@ public: | |||||||
|  |  | ||||||
| public: | public: | ||||||
|   // Shift operator coefficients for red-black preconditioned Mobius EOFA |   // Shift operator coefficients for red-black preconditioned Mobius EOFA | ||||||
|   Vector<Coeff_t> Mooee_shift; |   std::vector<Coeff_t> Mooee_shift; | ||||||
|   Vector<Coeff_t> MooeeInv_shift_lc; |   std::vector<Coeff_t> MooeeInv_shift_lc; | ||||||
|   Vector<Coeff_t> MooeeInv_shift_norm; |   std::vector<Coeff_t> MooeeInv_shift_norm; | ||||||
|   Vector<Coeff_t> MooeeInvDag_shift_lc; |   std::vector<Coeff_t> MooeeInvDag_shift_lc; | ||||||
|   Vector<Coeff_t> MooeeInvDag_shift_norm; |   std::vector<Coeff_t> MooeeInvDag_shift_norm; | ||||||
|  |  | ||||||
|   virtual void Instantiatable(void) {}; |   virtual void Instantiatable(void) {}; | ||||||
|  |  | ||||||
| @@ -74,18 +74,18 @@ public: | |||||||
|   // Instantiate different versions depending on Impl |   // Instantiate different versions depending on Impl | ||||||
|   ///////////////////////////////////////////////////// |   ///////////////////////////////////////////////////// | ||||||
|   void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, |   void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, | ||||||
| 	   Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper); | 	   std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); | ||||||
|  |  | ||||||
|   void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, |   void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, | ||||||
| 		 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper, | 		 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, | ||||||
| 		 Vector<Coeff_t>& shift_coeffs); | 		 std::vector<Coeff_t>& shift_coeffs); | ||||||
|  |  | ||||||
|   void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, |   void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, | ||||||
| 	      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper); | 	      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); | ||||||
|  |  | ||||||
|   void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, |   void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, | ||||||
| 		    Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper, | 		    std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, | ||||||
| 		    Vector<Coeff_t>& shift_coeffs); | 		    std::vector<Coeff_t>& shift_coeffs); | ||||||
|  |  | ||||||
|   virtual void RefreshShiftCoefficients(RealD new_shift); |   virtual void RefreshShiftCoefficients(RealD new_shift); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -102,11 +102,11 @@ public: | |||||||
| 		     GaugeField &mat,  | 		     GaugeField &mat,  | ||||||
| 		     const FermionField &A, const FermionField &B, int dag); | 		     const FermionField &A, const FermionField &B, int dag); | ||||||
|  |  | ||||||
|   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |   void DhopInternal(StencilImpl &st, DoubledGaugeField &U, | ||||||
|                     const FermionField &in, FermionField &out, int dag); |                     const FermionField &in, FermionField &out, int dag); | ||||||
|   void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |   void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, | ||||||
| 			       const FermionField &in, FermionField &out, int dag); | 			       const FermionField &in, FermionField &out, int dag); | ||||||
|   void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |   void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, | ||||||
| 				   const FermionField &in, FermionField &out, int dag); | 				   const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -152,9 +152,6 @@ public: | |||||||
|   DoubledGaugeField UmuEven; |   DoubledGaugeField UmuEven; | ||||||
|   DoubledGaugeField UmuOdd; |   DoubledGaugeField UmuOdd; | ||||||
|  |  | ||||||
|   LebesgueOrder Lebesgue; |  | ||||||
|   LebesgueOrder LebesgueEvenOdd; |  | ||||||
|    |  | ||||||
|   /////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////// | ||||||
|   // Conserved current utilities |   // Conserved current utilities | ||||||
|   /////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -42,7 +42,7 @@ public: | |||||||
|  |  | ||||||
|      void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { |      void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { | ||||||
|        this->MomentumSpacePropagatorHw(out,in,_m,twist); |        this->MomentumSpacePropagatorHw(out,in,_m,twist); | ||||||
|   }; |      }; | ||||||
|  |  | ||||||
|   // Constructors |   // Constructors | ||||||
|   OverlapWilsonCayleyTanhFermion(GaugeField &_Umu, |   OverlapWilsonCayleyTanhFermion(GaugeField &_Umu, | ||||||
|   | |||||||
| @@ -41,6 +41,10 @@ public: | |||||||
| public: | public: | ||||||
|  |  | ||||||
|   // Constructors |   // Constructors | ||||||
|  |   virtual void   Instantiatable(void){}; | ||||||
|  |   void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { | ||||||
|  |     this->MomentumSpacePropagatorHw(out,in,_m,twist); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu, |   OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu, | ||||||
| 				      GridCartesian         &FiveDimGrid, | 				      GridCartesian         &FiveDimGrid, | ||||||
|   | |||||||
| @@ -41,6 +41,9 @@ public: | |||||||
| public: | public: | ||||||
|  |  | ||||||
|   virtual void   Instantiatable(void){}; |   virtual void   Instantiatable(void){}; | ||||||
|  |   void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { | ||||||
|  |     this->MomentumSpacePropagatorHw(out,in,_m,twist); | ||||||
|  |   }; | ||||||
|   // Constructors |   // Constructors | ||||||
|   OverlapWilsonContFracTanhFermion(GaugeField &_Umu, |   OverlapWilsonContFracTanhFermion(GaugeField &_Umu, | ||||||
| 				   GridCartesian         &FiveDimGrid, | 				   GridCartesian         &FiveDimGrid, | ||||||
|   | |||||||
| @@ -40,6 +40,9 @@ public: | |||||||
|   INHERIT_IMPL_TYPES(Impl); |   INHERIT_IMPL_TYPES(Impl); | ||||||
|  |  | ||||||
|   virtual void   Instantiatable(void){}; |   virtual void   Instantiatable(void){}; | ||||||
|  |   void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { | ||||||
|  |     this->MomentumSpacePropagatorHw(out,in,_m,twist); | ||||||
|  |   }; | ||||||
|   // Constructors |   // Constructors | ||||||
|   OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu, |   OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu, | ||||||
| 					GridCartesian         &FiveDimGrid, | 					GridCartesian         &FiveDimGrid, | ||||||
|   | |||||||
| @@ -41,6 +41,9 @@ public: | |||||||
| public: | public: | ||||||
|  |  | ||||||
|   virtual void   Instantiatable(void){}; |   virtual void   Instantiatable(void){}; | ||||||
|  |   void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { | ||||||
|  |     this->MomentumSpacePropagatorHw(out,in,_m,twist); | ||||||
|  |   }; | ||||||
|   // Constructors |   // Constructors | ||||||
|   OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu, |   OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu, | ||||||
| 					  GridCartesian         &FiveDimGrid, | 					  GridCartesian         &FiveDimGrid, | ||||||
|   | |||||||
| @@ -40,6 +40,11 @@ public: | |||||||
|   INHERIT_IMPL_TYPES(Impl); |   INHERIT_IMPL_TYPES(Impl); | ||||||
|  |  | ||||||
|   virtual void   Instantiatable(void){}; |   virtual void   Instantiatable(void){}; | ||||||
|  |  | ||||||
|  |   void  MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { | ||||||
|  |     this->MomentumSpacePropagatorHw(out,in,_m,twist); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   // Constructors |   // Constructors | ||||||
|   OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu, |   OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu, | ||||||
| 					       GridCartesian         &FiveDimGrid, | 					       GridCartesian         &FiveDimGrid, | ||||||
|   | |||||||
| @@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl> | |||||||
| public: | public: | ||||||
|   INHERIT_IMPL_TYPES(Impl); |   INHERIT_IMPL_TYPES(Impl); | ||||||
|  |  | ||||||
|   const int part_frac_chroma_convention=1; |   const int part_frac_chroma_convention=0; | ||||||
|  |  | ||||||
|   void   Meooe_internal(const FermionField &in, FermionField &out,int dag); |   void   Meooe_internal(const FermionField &in, FermionField &out,int dag); | ||||||
|   void   Mooee_internal(const FermionField &in, FermionField &out,int dag); |   void   Mooee_internal(const FermionField &in, FermionField &out,int dag); | ||||||
| @@ -83,19 +83,78 @@ public: | |||||||
| 			   GridRedBlackCartesian &FourDimRedBlackGrid, | 			   GridRedBlackCartesian &FourDimRedBlackGrid, | ||||||
| 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams()); | 			   RealD _mass,RealD M5,const ImplParams &p= ImplParams()); | ||||||
|  |  | ||||||
|  |   PartialFractionFermion5D(GaugeField &_Umu, | ||||||
|  | 			   GridCartesian         &FiveDimGrid, | ||||||
|  | 			   GridRedBlackCartesian &FiveDimRedBlackGrid, | ||||||
|  | 			   GridCartesian         &FourDimGrid, | ||||||
|  | 			   GridRedBlackCartesian &FourDimRedBlackGrid, | ||||||
|  | 			   RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams()); | ||||||
|  |  | ||||||
|  |   void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist) | ||||||
|  |   { | ||||||
|  |     std::cout << "Free Propagator for PartialFraction"<<std::endl; | ||||||
|  |     FermionField in_k(in.Grid()); | ||||||
|  |     FermionField prop_k(in.Grid()); | ||||||
|  |      | ||||||
|  |     FFT theFFT((GridCartesian *) in.Grid()); | ||||||
|  |  | ||||||
|  |     //phase for boundary condition | ||||||
|  |     ComplexField coor(in.Grid()); | ||||||
|  |     ComplexField ph(in.Grid());  ph = Zero(); | ||||||
|  |     FermionField in_buf(in.Grid()); in_buf = Zero(); | ||||||
|  |     typedef typename Simd::scalar_type Scalar; | ||||||
|  |     Scalar ci(0.0,1.0); | ||||||
|  |     assert(twist.size() == Nd);//check that twist is Nd | ||||||
|  |     assert(boundary.size() == Nd);//check that boundary conditions is Nd | ||||||
|  |     int shift = 0; | ||||||
|  |     for(unsigned int nu = 0; nu < Nd; nu++) | ||||||
|  |       { | ||||||
|  | 	// Shift coordinate lattice index by 1 to account for 5th dimension. | ||||||
|  | 	LatticeCoordinate(coor, nu + shift); | ||||||
|  | 	double boundary_phase = ::acos(real(boundary[nu])); | ||||||
|  | 	ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift]))); | ||||||
|  | 	//momenta for propagator shifted by twist+boundary | ||||||
|  | 	twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI)); | ||||||
|  |       } | ||||||
|  |     in_buf = exp(ci*ph*(-1.0))*in; | ||||||
|  |  | ||||||
|  |     theFFT.FFT_all_dim(in_k,in,FFT::forward); | ||||||
|  |     if ( this->qmu.size() ){ | ||||||
|  |       this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu); | ||||||
|  |     } else { | ||||||
|  |       this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist); | ||||||
|  |     } | ||||||
|  |     theFFT.FFT_all_dim(out,prop_k,FFT::backward); | ||||||
|  |      | ||||||
|  |     //phase for boundary condition | ||||||
|  |     out = out * exp(ci*ph); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { | ||||||
|  |     std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions | ||||||
|  |     std::vector<Complex> boundary; | ||||||
|  |     for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions | ||||||
|  |     FreePropagator(in,out,mass,boundary,twist); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);}; | ||||||
|  |   void addQmu(const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
| protected: | protected: | ||||||
|  |  | ||||||
|   virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale); |   virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale); | ||||||
|   virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata); |   virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata); | ||||||
|  |  | ||||||
|  |   std::vector<RealD> qmu; | ||||||
|  |  | ||||||
|   // Part frac |   // Part frac | ||||||
|   RealD mass; |   RealD mass; | ||||||
|   RealD dw_diag; |   RealD dw_diag; | ||||||
|   RealD R; |   RealD R; | ||||||
|   RealD amax; |   RealD amax; | ||||||
|   RealD scale; |   RealD scale; | ||||||
|   Vector<double> p;  |   std::vector<double> p;  | ||||||
|   Vector<double> q; |   std::vector<double> q; | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -35,7 +35,7 @@ template<class Matrix, class Field> | |||||||
| class KappaSimilarityTransform { | class KappaSimilarityTransform { | ||||||
| public: | public: | ||||||
|   INHERIT_IMPL_TYPES(Matrix); |   INHERIT_IMPL_TYPES(Matrix); | ||||||
|   Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag; |   std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag; | ||||||
|  |  | ||||||
|   KappaSimilarityTransform (Matrix &zmob) { |   KappaSimilarityTransform (Matrix &zmob) { | ||||||
|     for (int i=0;i<(int)zmob.bs.size();i++) { |     for (int i=0;i<(int)zmob.bs.size();i++) { | ||||||
|   | |||||||
| @@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub | |||||||
|     |     | ||||||
|  public: |  public: | ||||||
|  |  | ||||||
|   void DhopImproved(StencilImpl &st, LebesgueOrder &lo,  |   void DhopImproved(StencilImpl &st, | ||||||
| 		    DoubledGaugeField &U, DoubledGaugeField &UUU,  | 		    DoubledGaugeField &U, DoubledGaugeField &UUU,  | ||||||
| 		    const FermionField &in, FermionField &out, int dag, int interior,int exterior); | 		    const FermionField &in, FermionField &out, int dag, int interior,int exterior); | ||||||
|   void DhopNaive(StencilImpl &st, LebesgueOrder &lo,  |   void DhopNaive(StencilImpl &st, | ||||||
| 		 DoubledGaugeField &U, | 		 DoubledGaugeField &U, | ||||||
| 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior); | 		 const FermionField &in, FermionField &out, int dag, int interior,int exterior); | ||||||
|    |    | ||||||
|   | |||||||
| @@ -47,7 +47,7 @@ public: | |||||||
|   static int PartialCompressionFactor(GridBase *grid) { return 1;} |   static int PartialCompressionFactor(GridBase *grid) { return 1;} | ||||||
| #endif | #endif | ||||||
|   template<class vobj,class cobj,class compressor> |   template<class vobj,class cobj,class compressor> | ||||||
|   static void Gather_plane_simple (commVector<std::pair<int,int> >& table, |   static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table, | ||||||
| 				   const Lattice<vobj> &rhs, | 				   const Lattice<vobj> &rhs, | ||||||
| 				   cobj *buffer, | 				   cobj *buffer, | ||||||
| 				   compressor &compress, | 				   compressor &compress, | ||||||
| @@ -109,7 +109,7 @@ public: | |||||||
|   // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. |   // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   template<class vobj,class cobj,class compressor> |   template<class vobj,class cobj,class compressor> | ||||||
|   static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, |   static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, | ||||||
| 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask, | 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask, | ||||||
| 				    compressor &compress,int type,int partial) | 				    compressor &compress,int type,int partial) | ||||||
|   { |   { | ||||||
| @@ -197,7 +197,7 @@ public: | |||||||
| #endif | #endif | ||||||
|    |    | ||||||
|   template<class vobj,class cobj,class compressor> |   template<class vobj,class cobj,class compressor> | ||||||
|   static void Gather_plane_simple (commVector<std::pair<int,int> >& table, |   static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table, | ||||||
| 					 const Lattice<vobj> &rhs, | 					 const Lattice<vobj> &rhs, | ||||||
| 					 cobj *buffer, | 					 cobj *buffer, | ||||||
| 					 compressor &compress, | 					 compressor &compress, | ||||||
| @@ -208,7 +208,7 @@ public: | |||||||
|     else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); |     else        FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); | ||||||
|   } |   } | ||||||
|   template<class vobj,class cobj,class compressor> |   template<class vobj,class cobj,class compressor> | ||||||
|   static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, |   static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, | ||||||
| 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask, | 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask, | ||||||
| 				    compressor &compress,int type,int partial) | 				    compressor &compress,int type,int partial) | ||||||
|   { |   { | ||||||
| @@ -402,7 +402,6 @@ public: | |||||||
|  |  | ||||||
|   typedef CartesianStencil<vobj,cobj,Parameters> Base; |   typedef CartesianStencil<vobj,cobj,Parameters> Base; | ||||||
|   typedef typename Base::View_type View_type; |   typedef typename Base::View_type View_type; | ||||||
|   typedef typename Base::StencilVector StencilVector; |  | ||||||
|  |  | ||||||
|   //  Vector<int> surface_list; |   //  Vector<int> surface_list; | ||||||
|   WilsonStencil(GridBase *grid, |   WilsonStencil(GridBase *grid, | ||||||
| @@ -415,29 +414,6 @@ public: | |||||||
|     //    surface_list.resize(0); |     //    surface_list.resize(0); | ||||||
|     this->same_node.resize(npoints); |     this->same_node.resize(npoints); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   /* |  | ||||||
|   void BuildSurfaceList(int Ls,int vol4){ |  | ||||||
|  |  | ||||||
|     // find same node for SHM |  | ||||||
|     // Here we know the distance is 1 for WilsonStencil |  | ||||||
|     for(int point=0;point<this->_npoints;point++){ |  | ||||||
|       this->same_node[point] = this->SameNode(point); |  | ||||||
|     } |  | ||||||
|      |  | ||||||
|     for(int site = 0 ;site< vol4;site++){ |  | ||||||
|       int local = 1; |  | ||||||
|       for(int point=0;point<this->_npoints;point++){ |  | ||||||
| 	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){  |  | ||||||
| 	  local = 0; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|       if(local == 0) {  |  | ||||||
| 	surface_list.push_back(site); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   */ |  | ||||||
|    |    | ||||||
|   template < class compressor> |   template < class compressor> | ||||||
|   void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)  |   void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)  | ||||||
| @@ -508,6 +484,12 @@ public: | |||||||
|     this->face_table_computed=1; |     this->face_table_computed=1; | ||||||
|     assert(this->u_comm_offset==this->_unified_buffer_size); |     assert(this->u_comm_offset==this->_unified_buffer_size); | ||||||
|     accelerator_barrier(); |     accelerator_barrier(); | ||||||
|  | #ifdef NVLINK_GET | ||||||
|  |     #warning "NVLINK_GET" | ||||||
|  |     this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his | ||||||
|  |     // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check | ||||||
|  |     // Or issue barrier AFTER the DMA is running | ||||||
|  | #endif     | ||||||
|   } |   } | ||||||
|  |  | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -126,14 +126,17 @@ public: | |||||||
|   void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, |   void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, | ||||||
|                      const FermionField &A, const FermionField &B, int dag); |                      const FermionField &A, const FermionField &B, int dag); | ||||||
|  |  | ||||||
|   void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |   void DhopInternal(StencilImpl &st, | ||||||
|  | 		    DoubledGaugeField &U, | ||||||
|                     const FermionField &in, FermionField &out, int dag); |                     const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|   void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |   void DhopInternalSerial(StencilImpl &st, | ||||||
|                     const FermionField &in, FermionField &out, int dag); | 			  DoubledGaugeField &U, | ||||||
|  | 			  const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|   void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |   void DhopInternalOverlappedComms(StencilImpl &st, | ||||||
|                     const FermionField &in, FermionField &out, int dag); | 				   DoubledGaugeField &U, | ||||||
|  | 				   const FermionField &in, FermionField &out, int dag); | ||||||
|  |  | ||||||
|   // Constructor |   // Constructor | ||||||
|   WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, |   WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, | ||||||
| @@ -168,9 +171,6 @@ public: | |||||||
|   DoubledGaugeField UmuEven; |   DoubledGaugeField UmuEven; | ||||||
|   DoubledGaugeField UmuOdd; |   DoubledGaugeField UmuOdd; | ||||||
|  |  | ||||||
|   LebesgueOrder Lebesgue; |  | ||||||
|   LebesgueOrder LebesgueEvenOdd; |  | ||||||
|  |  | ||||||
|   WilsonAnisotropyCoefficients anisotropyCoeff; |   WilsonAnisotropyCoefficients anisotropyCoeff; | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -109,6 +109,8 @@ public: | |||||||
|   void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; |   void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; | ||||||
|   void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; |   void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; | ||||||
|   void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; |   void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ; | ||||||
|  |   void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist, | ||||||
|  | 				  std::vector<double> qmu) ; | ||||||
|  |  | ||||||
|   // Implement hopping term non-hermitian hopping term; half cb or both |   // Implement hopping term non-hermitian hopping term; half cb or both | ||||||
|   // Implement s-diagonal DW |   // Implement s-diagonal DW | ||||||
| @@ -117,6 +119,9 @@ public: | |||||||
|   void DhopOE(const FermionField &in, FermionField &out,int dag); |   void DhopOE(const FermionField &in, FermionField &out,int dag); | ||||||
|   void DhopEO(const FermionField &in, FermionField &out,int dag); |   void DhopEO(const FermionField &in, FermionField &out,int dag); | ||||||
|  |  | ||||||
|  |   void DhopComms  (const FermionField &in, FermionField &out); | ||||||
|  |   void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids); | ||||||
|  |    | ||||||
|   // add a DhopComm |   // add a DhopComm | ||||||
|   // -- suboptimal interface will presently trigger multiple comms. |   // -- suboptimal interface will presently trigger multiple comms. | ||||||
|   void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); |   void DhopDir(const FermionField &in, FermionField &out,int dir,int disp); | ||||||
| @@ -135,21 +140,18 @@ public: | |||||||
| 		     int dag); | 		     int dag); | ||||||
|      |      | ||||||
|   void DhopInternal(StencilImpl & st, |   void DhopInternal(StencilImpl & st, | ||||||
| 		    LebesgueOrder &lo, |  | ||||||
| 		    DoubledGaugeField &U, | 		    DoubledGaugeField &U, | ||||||
| 		    const FermionField &in,  | 		    const FermionField &in,  | ||||||
| 		    FermionField &out, | 		    FermionField &out, | ||||||
| 		    int dag); | 		    int dag); | ||||||
|  |  | ||||||
|   void DhopInternalOverlappedComms(StencilImpl & st, |   void DhopInternalOverlappedComms(StencilImpl & st, | ||||||
| 				   LebesgueOrder &lo, |  | ||||||
| 				   DoubledGaugeField &U, | 				   DoubledGaugeField &U, | ||||||
| 				   const FermionField &in,  | 				   const FermionField &in,  | ||||||
| 				   FermionField &out, | 				   FermionField &out, | ||||||
| 				   int dag); | 				   int dag); | ||||||
|  |  | ||||||
|   void DhopInternalSerialComms(StencilImpl & st, |   void DhopInternalSerialComms(StencilImpl & st, | ||||||
| 			       LebesgueOrder &lo, |  | ||||||
| 			       DoubledGaugeField &U, | 			       DoubledGaugeField &U, | ||||||
| 			       const FermionField &in,  | 			       const FermionField &in,  | ||||||
| 			       FermionField &out, | 			       FermionField &out, | ||||||
| @@ -203,9 +205,6 @@ public: | |||||||
|   DoubledGaugeField UmuEven; |   DoubledGaugeField UmuEven; | ||||||
|   DoubledGaugeField UmuOdd; |   DoubledGaugeField UmuOdd; | ||||||
|      |      | ||||||
|   LebesgueOrder Lebesgue; |  | ||||||
|   LebesgueOrder LebesgueEvenOdd; |  | ||||||
|      |  | ||||||
|   // Comms buffer |   // Comms buffer | ||||||
|   //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; |   //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -57,6 +57,10 @@ public: | |||||||
| 			 int Ls, int Nsite, const FermionField &in, FermionField &out, | 			 int Ls, int Nsite, const FermionField &in, FermionField &out, | ||||||
| 			 int interior=1,int exterior=1) ; | 			 int interior=1,int exterior=1) ; | ||||||
|  |  | ||||||
|  |   static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
|  | 			 int Ls, int Nsite, const FermionField &in, FermionField &out, | ||||||
|  | 			 uint64_t *ids); | ||||||
|  |    | ||||||
|   static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, |   static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 			    int Ls, int Nsite, const FermionField &in, FermionField &out, | 			    int Ls, int Nsite, const FermionField &in, FermionField &out, | ||||||
| 			    int interior=1,int exterior=1) ; | 			    int interior=1,int exterior=1) ; | ||||||
|   | |||||||
| @@ -58,7 +58,7 @@ public: | |||||||
|   { |   { | ||||||
|     //    RealD eps = 1.0; |     //    RealD eps = 1.0; | ||||||
|     std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl; |     std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl; | ||||||
|     Vector<Coeff_t> zgamma(this->Ls); |     std::vector<Coeff_t> zgamma(this->Ls); | ||||||
|     for(int s=0;s<this->Ls;s++){ |     for(int s=0;s<this->Ls;s++){ | ||||||
|       zgamma[s] = gamma[s]; |       zgamma[s] = gamma[s]; | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -1,3 +1,5 @@ | |||||||
|  | #if 0 | ||||||
|  | 
 | ||||||
| /*************************************************************************************
 | /*************************************************************************************
 | ||||||
| 
 | 
 | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
| @@ -818,3 +820,5 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi, | |||||||
| } | } | ||||||
| 
 | 
 | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  | 
 | ||||||
|  | #endif | ||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | #if 0 | ||||||
| /*************************************************************************************
 | /*************************************************************************************
 | ||||||
| 
 | 
 | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
| @@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void) | |||||||
| } | } | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
| 
 | 
 | ||||||
|  | #endif | ||||||
| @@ -72,7 +72,7 @@ public: | |||||||
|   void ThreadInterleave(void); |   void ThreadInterleave(void); | ||||||
| 
 | 
 | ||||||
| private: | private: | ||||||
|   Vector<IndexInteger> _LebesgueReorder; |   deviceVector<IndexInteger> _LebesgueReorder; | ||||||
| 
 | 
 | ||||||
| };     | };     | ||||||
| 
 | 
 | ||||||
| @@ -48,7 +48,8 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu, | |||||||
| 			FourDimGrid, | 			FourDimGrid, | ||||||
| 			FourDimRedBlackGrid,_M5,p), | 			FourDimRedBlackGrid,_M5,p), | ||||||
|   mass_plus(_mass), mass_minus(_mass) |   mass_plus(_mass), mass_minus(_mass) | ||||||
| {  | { | ||||||
|  |   // qmu defaults to zero size; | ||||||
| } | } | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////// | ||||||
| @@ -156,18 +157,18 @@ template<class Impl> | |||||||
| void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   Vector<Coeff_t> diag (Ls,1.0); |   std::vector<Coeff_t> diag (Ls,1.0); | ||||||
|   Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus; |   std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus; | ||||||
|   Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus; |   std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus; | ||||||
|   M5D(psi,chi,chi,lower,diag,upper); |   M5D(psi,chi,chi,lower,diag,upper); | ||||||
| } | } | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din) | void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din) | ||||||
| { | { | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   Vector<Coeff_t> diag = bs; |   std::vector<Coeff_t> diag = bs; | ||||||
|   Vector<Coeff_t> upper= cs; |   std::vector<Coeff_t> upper= cs; | ||||||
|   Vector<Coeff_t> lower= cs;  |   std::vector<Coeff_t> lower= cs;  | ||||||
|   upper[Ls-1]=-mass_minus*upper[Ls-1]; |   upper[Ls-1]=-mass_minus*upper[Ls-1]; | ||||||
|   lower[0]   =-mass_plus*lower[0]; |   lower[0]   =-mass_plus*lower[0]; | ||||||
|   M5D(psi,psi,Din,lower,diag,upper); |   M5D(psi,psi,Din,lower,diag,upper); | ||||||
| @@ -176,9 +177,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D | |||||||
| template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi) | template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   Vector<Coeff_t> diag = beo; |   std::vector<Coeff_t> diag = beo; | ||||||
|   Vector<Coeff_t> upper(Ls); |   std::vector<Coeff_t> upper(Ls); | ||||||
|   Vector<Coeff_t> lower(Ls); |   std::vector<Coeff_t> lower(Ls); | ||||||
|   for(int i=0;i<Ls;i++) { |   for(int i=0;i<Ls;i++) { | ||||||
|     upper[i]=-ceo[i]; |     upper[i]=-ceo[i]; | ||||||
|     lower[i]=-ceo[i]; |     lower[i]=-ceo[i]; | ||||||
| @@ -191,9 +192,9 @@ template<class Impl> | |||||||
| void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   Vector<Coeff_t> diag = bee; |   std::vector<Coeff_t> diag = bee; | ||||||
|   Vector<Coeff_t> upper(Ls); |   std::vector<Coeff_t> upper(Ls); | ||||||
|   Vector<Coeff_t> lower(Ls); |   std::vector<Coeff_t> lower(Ls); | ||||||
|   for(int i=0;i<Ls;i++) { |   for(int i=0;i<Ls;i++) { | ||||||
|     upper[i]=-cee[i]; |     upper[i]=-cee[i]; | ||||||
|     lower[i]=-cee[i]; |     lower[i]=-cee[i]; | ||||||
| @@ -206,9 +207,9 @@ template<class Impl> | |||||||
| void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   Vector<Coeff_t> diag = bee; |   std::vector<Coeff_t> diag = bee; | ||||||
|   Vector<Coeff_t> upper(Ls); |   std::vector<Coeff_t> upper(Ls); | ||||||
|   Vector<Coeff_t> lower(Ls); |   std::vector<Coeff_t> lower(Ls); | ||||||
|  |  | ||||||
|   for (int s=0;s<Ls;s++){ |   for (int s=0;s<Ls;s++){ | ||||||
|     // Assemble the 5d matrix |     // Assemble the 5d matrix | ||||||
| @@ -236,9 +237,9 @@ template<class Impl> | |||||||
| void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   Vector<Coeff_t> diag(Ls,1.0); |   std::vector<Coeff_t> diag(Ls,1.0); | ||||||
|   Vector<Coeff_t> upper(Ls,-1.0); |   std::vector<Coeff_t> upper(Ls,-1.0); | ||||||
|   Vector<Coeff_t> lower(Ls,-1.0); |   std::vector<Coeff_t> lower(Ls,-1.0); | ||||||
|   upper[Ls-1]=-mass_plus*upper[Ls-1]; |   upper[Ls-1]=-mass_plus*upper[Ls-1]; | ||||||
|   lower[0]   =-mass_minus*lower[0]; |   lower[0]   =-mass_minus*lower[0]; | ||||||
|   M5Ddag(psi,chi,chi,lower,diag,upper); |   M5Ddag(psi,chi,chi,lower,diag,upper); | ||||||
| @@ -248,9 +249,9 @@ template<class Impl> | |||||||
| void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din) | void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField &Din) | ||||||
| { | { | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|   Vector<Coeff_t> diag =bs; |   std::vector<Coeff_t> diag =bs; | ||||||
|   Vector<Coeff_t> upper=cs; |   std::vector<Coeff_t> upper=cs; | ||||||
|   Vector<Coeff_t> lower=cs;  |   std::vector<Coeff_t> lower=cs;  | ||||||
|  |  | ||||||
|   for (int s=0;s<Ls;s++){ |   for (int s=0;s<Ls;s++){ | ||||||
|     if ( s== 0 ) { |     if ( s== 0 ) { | ||||||
| @@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D    (const FermionField &psi, FermionField | |||||||
|   M5Ddag(psi,psi,Din,lower,diag,upper); |   M5Ddag(psi,psi,Din,lower,diag,upper); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | template<class Impl> | ||||||
|  | void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag) | ||||||
|  | { | ||||||
|  |   if ( qmu.size() ) { | ||||||
|  |  | ||||||
|  |     Gamma::Algebra Gmu [] = { | ||||||
|  |       Gamma::Algebra::GammaX, | ||||||
|  |       Gamma::Algebra::GammaY, | ||||||
|  |       Gamma::Algebra::GammaZ, | ||||||
|  |       Gamma::Algebra::GammaT | ||||||
|  |     }; | ||||||
|  |     std::vector<ComplexD> coeff(Nd); | ||||||
|  |     ComplexD ci(0,1); | ||||||
|  |  | ||||||
|  |     assert(qmu.size()==Nd); | ||||||
|  |  | ||||||
|  |     for(int mu=0;mu<Nd;mu++){ | ||||||
|  |        coeff[mu] = ci*qmu[mu]; | ||||||
|  |        if ( dag ) coeff[mu] = conjugate(coeff[mu]); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     chi = chi + Gamma(Gmu[0])*psi*coeff[0]; | ||||||
|  |     for(int mu=1;mu<Nd;mu++){ | ||||||
|  |       chi = chi + Gamma(Gmu[mu])*psi*coeff[mu]; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi) | void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi) | ||||||
| { | { | ||||||
| @@ -277,8 +306,12 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi) | |||||||
|    |    | ||||||
|   // Assemble Din |   // Assemble Din | ||||||
|   Meooe5D(psi,Din); |   Meooe5D(psi,Din); | ||||||
|    |  | ||||||
|   this->DW(Din,chi,DaggerNo); |   this->DW(Din,chi,DaggerNo); | ||||||
|  |  | ||||||
|  |   // add i q_mu gamma_mu here | ||||||
|  |   addQmu(Din,chi,DaggerNo); | ||||||
|  |    | ||||||
|   // ((b D_W + D_w hop terms +1) on s-diag |   // ((b D_W + D_w hop terms +1) on s-diag | ||||||
|   axpby(chi,1.0,1.0,chi,psi);  |   axpby(chi,1.0,1.0,chi,psi);  | ||||||
|    |    | ||||||
| @@ -295,6 +328,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi) | |||||||
|   FermionField Din(psi.Grid()); |   FermionField Din(psi.Grid()); | ||||||
|   // Apply Dw |   // Apply Dw | ||||||
|   this->DW(psi,Din,DaggerYes);  |   this->DW(psi,Din,DaggerYes);  | ||||||
|  |  | ||||||
|  |   // add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not. | ||||||
|  |   addQmu(psi,Din,DaggerYes); | ||||||
|    |    | ||||||
|   MeooeDag5D(Din,chi); |   MeooeDag5D(Din,chi); | ||||||
|    |    | ||||||
| @@ -394,7 +430,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) | void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) | ||||||
| { | { | ||||||
|   Vector<Coeff_t> gamma(this->Ls); |   std::vector<Coeff_t> gamma(this->Ls); | ||||||
|   for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s]; |   for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s]; | ||||||
|   SetCoefficientsInternal(1.0,gamma,b,c); |   SetCoefficientsInternal(1.0,gamma,b,c); | ||||||
| } | } | ||||||
| @@ -402,13 +438,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) | void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) | ||||||
| { | { | ||||||
|   Vector<Coeff_t> gamma(this->Ls); |   std::vector<Coeff_t> gamma(this->Ls); | ||||||
|   for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s]; |   for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s]; | ||||||
|   SetCoefficientsInternal(zolo_hi,gamma,b,c); |   SetCoefficientsInternal(zolo_hi,gamma,b,c); | ||||||
| } | } | ||||||
| //Zolo | //Zolo | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c) | void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c) | ||||||
| { | { | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|  |  | ||||||
| @@ -488,7 +524,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t | |||||||
|   leem.resize(Ls); |   leem.resize(Ls); | ||||||
|   uee.resize(Ls); |   uee.resize(Ls); | ||||||
|   ueem.resize(Ls); |   ueem.resize(Ls); | ||||||
|    |  | ||||||
|   for(int i=0;i<Ls;i++){ |   for(int i=0;i<Ls;i++){ | ||||||
|      |      | ||||||
|     dee[i] = bee[i]; |     dee[i] = bee[i]; | ||||||
| @@ -529,6 +565,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t | |||||||
|     dee[Ls-1] += delta_d; |     dee[Ls-1] += delta_d; | ||||||
|   }   |   }   | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////// | ||||||
|  |   // Device buffers | ||||||
|  |   ////////////////////////////////////////// | ||||||
|  |   d_diag.resize(Ls); | ||||||
|  |   d_upper.resize(Ls); | ||||||
|  |   d_lower.resize(Ls); | ||||||
|  |  | ||||||
|  |   d_dee.resize(Ls); | ||||||
|  |   d_lee.resize(Ls); | ||||||
|  |   d_uee.resize(Ls); | ||||||
|  |   d_leem.resize(Ls); | ||||||
|  |   d_ueem.resize(Ls); | ||||||
|   //  int inv=1; |   //  int inv=1; | ||||||
|   //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv); |   //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv); | ||||||
|   //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); |   //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag); | ||||||
|   | |||||||
| @@ -43,9 +43,9 @@ void | |||||||
| CayleyFermion5D<Impl>::M5D(const FermionField &psi_i, | CayleyFermion5D<Impl>::M5D(const FermionField &psi_i, | ||||||
| 			       const FermionField &phi_i,  | 			       const FermionField &phi_i,  | ||||||
| 			       FermionField &chi_i, | 			       FermionField &chi_i, | ||||||
| 			       Vector<Coeff_t> &lower, | 			       std::vector<Coeff_t> &lower, | ||||||
| 			       Vector<Coeff_t> &diag, | 			       std::vector<Coeff_t> &diag, | ||||||
| 			       Vector<Coeff_t> &upper) | 			       std::vector<Coeff_t> &upper) | ||||||
| { | { | ||||||
|    |    | ||||||
|   chi_i.Checkerboard()=psi_i.Checkerboard(); |   chi_i.Checkerboard()=psi_i.Checkerboard(); | ||||||
| @@ -55,12 +55,16 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i, | |||||||
|   autoView(chi , chi_i,AcceleratorWrite); |   autoView(chi , chi_i,AcceleratorWrite); | ||||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); |   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||||
|  |  | ||||||
|   auto pdiag = &diag[0]; |  | ||||||
|   auto pupper = &upper[0]; |  | ||||||
|   auto plower = &lower[0]; |  | ||||||
|  |  | ||||||
|   int Ls =this->Ls; |   int Ls =this->Ls; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t)); | ||||||
|  |    | ||||||
|  |   auto pdiag = &d_diag[0]; | ||||||
|  |   auto pupper = &d_upper[0]; | ||||||
|  |   auto plower = &d_lower[0]; | ||||||
|  |  | ||||||
|   // 10 = 3 complex mult + 2 complex add |   // 10 = 3 complex mult + 2 complex add | ||||||
|   // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) |   // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) | ||||||
|   uint64_t nloop = grid->oSites(); |   uint64_t nloop = grid->oSites(); | ||||||
| @@ -82,9 +86,9 @@ void | |||||||
| CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i, | CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i, | ||||||
| 			      const FermionField &phi_i,  | 			      const FermionField &phi_i,  | ||||||
| 			      FermionField &chi_i, | 			      FermionField &chi_i, | ||||||
| 			      Vector<Coeff_t> &lower, | 			      std::vector<Coeff_t> &lower, | ||||||
| 			      Vector<Coeff_t> &diag, | 			      std::vector<Coeff_t> &diag, | ||||||
| 			      Vector<Coeff_t> &upper) | 			      std::vector<Coeff_t> &upper) | ||||||
| { | { | ||||||
|   chi_i.Checkerboard()=psi_i.Checkerboard(); |   chi_i.Checkerboard()=psi_i.Checkerboard(); | ||||||
|   GridBase *grid=psi_i.Grid(); |   GridBase *grid=psi_i.Grid(); | ||||||
| @@ -93,12 +97,16 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i, | |||||||
|   autoView(chi , chi_i,AcceleratorWrite); |   autoView(chi , chi_i,AcceleratorWrite); | ||||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); |   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||||
|  |  | ||||||
|   auto pdiag = &diag[0]; |  | ||||||
|   auto pupper = &upper[0]; |  | ||||||
|   auto plower = &lower[0]; |  | ||||||
|  |  | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t)); | ||||||
|  |    | ||||||
|  |   auto pdiag = &d_diag[0]; | ||||||
|  |   auto pupper = &d_upper[0]; | ||||||
|  |   auto plower = &d_lower[0]; | ||||||
|  |  | ||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|   uint64_t nloop = grid->oSites(); |   uint64_t nloop = grid->oSites(); | ||||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ |   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||||
| @@ -126,11 +134,17 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi | |||||||
|  |  | ||||||
|   int Ls=this->Ls; |   int Ls=this->Ls; | ||||||
|  |  | ||||||
|   auto plee  = & lee [0]; |   acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto pdee  = & dee [0]; |   acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto puee  = & uee [0]; |   acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto pleem = & leem[0]; |   acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto pueem = & ueem[0]; |   acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|  |   auto plee  = & d_lee [0]; | ||||||
|  |   auto pdee  = & d_dee [0]; | ||||||
|  |   auto puee  = & d_uee [0]; | ||||||
|  |   auto pleem = & d_leem[0]; | ||||||
|  |   auto pueem = & d_ueem[0]; | ||||||
|  |  | ||||||
|   uint64_t nloop = grid->oSites()/Ls; |   uint64_t nloop = grid->oSites()/Ls; | ||||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ |   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||||
| @@ -182,11 +196,17 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi | |||||||
|   autoView(psi , psi_i,AcceleratorRead); |   autoView(psi , psi_i,AcceleratorRead); | ||||||
|   autoView(chi , chi_i,AcceleratorWrite); |   autoView(chi , chi_i,AcceleratorWrite); | ||||||
|  |  | ||||||
|   auto plee  = & lee [0]; |   acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto pdee  = & dee [0]; |   acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto puee  = & uee [0]; |   acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto pleem = & leem[0]; |   acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto pueem = & ueem[0]; |   acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|  |   auto plee  = & d_lee [0]; | ||||||
|  |   auto pdee  = & d_dee [0]; | ||||||
|  |   auto puee  = & d_uee [0]; | ||||||
|  |   auto pleem = & d_leem[0]; | ||||||
|  |   auto pueem = & d_ueem[0]; | ||||||
|  |  | ||||||
|   assert(psi.Checkerboard() == psi.Checkerboard()); |   assert(psi.Checkerboard() == psi.Checkerboard()); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -42,13 +42,13 @@ template<class Impl> | |||||||
| void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata) | void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata) | ||||||
| { | { | ||||||
|   // How to check Ls matches?? |   // How to check Ls matches?? | ||||||
|   //      std::cout<<GridLogMessage << Ls << " Ls"<<std::endl; |   std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl; | ||||||
|   //      std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl; |   std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl; | ||||||
|   //      std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl; |   std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl; | ||||||
|   //      std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl; |   std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl; | ||||||
|   //      std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl; |   std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl; | ||||||
|   //      std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl; |  | ||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |   std::cout<<GridLogMessage << Ls << " Ls"<<std::endl; | ||||||
|   assert(zdata->db==Ls);// Beta has Ls coeffs |   assert(zdata->db==Ls);// Beta has Ls coeffs | ||||||
|  |  | ||||||
|   R=(1+this->mass)/(1-this->mass); |   R=(1+this->mass)/(1-this->mass); | ||||||
| @@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D( | |||||||
|       int Ls = this->Ls; |       int Ls = this->Ls; | ||||||
|       conformable(solution5d.Grid(),this->FermionGrid()); |       conformable(solution5d.Grid(),this->FermionGrid()); | ||||||
|       conformable(exported4d.Grid(),this->GaugeGrid()); |       conformable(exported4d.Grid(),this->GaugeGrid()); | ||||||
|       ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); |       ExtractSlice(exported4d, solution5d, Ls-1, 0); | ||||||
|     } |     } | ||||||
|     template<class Impl> |     template<class Impl> | ||||||
|     void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) |     void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) | ||||||
| @@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D( | |||||||
|       conformable(input4d.Grid()   ,this->GaugeGrid()); |       conformable(input4d.Grid()   ,this->GaugeGrid()); | ||||||
|       FermionField tmp(this->FermionGrid()); |       FermionField tmp(this->FermionGrid()); | ||||||
|       tmp=Zero(); |       tmp=Zero(); | ||||||
|       InsertSlice(input4d, tmp, Ls-1, Ls-1); |       InsertSlice(input4d, tmp, Ls-1, 0); | ||||||
|       tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; |       tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; | ||||||
|       this->Dminus(tmp,imported5d); |       this->Dminus(tmp,imported5d); | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid); | |||||||
| // Pplus  backwards.. | // Pplus  backwards.. | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,  | void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,  | ||||||
| 				      Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper) | 				      std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper) | ||||||
| { | { | ||||||
|   chi_i.Checkerboard() = psi_i.Checkerboard(); |   chi_i.Checkerboard() = psi_i.Checkerboard(); | ||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
| @@ -50,9 +50,15 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi | |||||||
|   autoView( psi , psi_i, AcceleratorRead); |   autoView( psi , psi_i, AcceleratorRead); | ||||||
|   autoView( chi , chi_i, AcceleratorWrite); |   autoView( chi , chi_i, AcceleratorWrite); | ||||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); |   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||||
|   auto pdiag = &diag[0]; |  | ||||||
|   auto pupper = &upper[0]; |   auto pdiag  = &this->d_diag[0]; | ||||||
|   auto plower = &lower[0]; |   auto pupper = &this->d_upper[0]; | ||||||
|  |   auto plower = &this->d_lower[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|    |    | ||||||
|   auto nloop=grid->oSites()/Ls; |   auto nloop=grid->oSites()/Ls; | ||||||
| @@ -73,7 +79,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi | |||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,  | void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,  | ||||||
| 					 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper) | 					 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper) | ||||||
| { | { | ||||||
|   chi_i.Checkerboard() = psi_i.Checkerboard(); |   chi_i.Checkerboard() = psi_i.Checkerboard(); | ||||||
|   GridBase* grid = psi_i.Grid(); |   GridBase* grid = psi_i.Grid(); | ||||||
| @@ -83,9 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio | |||||||
|   autoView( phi , phi_i, AcceleratorRead); |   autoView( phi , phi_i, AcceleratorRead); | ||||||
|   autoView( chi , chi_i, AcceleratorWrite); |   autoView( chi , chi_i, AcceleratorWrite); | ||||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); |   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||||
|   auto pdiag = &diag[0]; |    | ||||||
|   auto pupper = &upper[0]; |   auto pdiag  = &this->d_diag[0]; | ||||||
|   auto plower = &lower[0]; |   auto pupper = &this->d_upper[0]; | ||||||
|  |   auto plower = &this->d_lower[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|  |  | ||||||
| @@ -114,12 +125,17 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie | |||||||
|   autoView( chi, chi_i, AcceleratorWrite); |   autoView( chi, chi_i, AcceleratorWrite); | ||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |  | ||||||
|   auto plee  = & this->lee[0]; |   auto plee  = & this->d_lee [0]; | ||||||
|   auto pdee  = & this->dee[0]; |   auto pdee  = & this->d_dee [0]; | ||||||
|   auto puee  = & this->uee[0]; |   auto puee  = & this->d_uee [0]; | ||||||
|  |   auto pleem = & this->d_leem[0]; | ||||||
|   auto pleem = & this->leem[0]; |   auto pueem = & this->d_ueem[0]; | ||||||
|   auto pueem = & this->ueem[0]; |    | ||||||
|  |   acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   uint64_t nloop=grid->oSites()/Ls; |   uint64_t nloop=grid->oSites()/Ls; | ||||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ |   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||||
|   | |||||||
| @@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi | |||||||
|     else{ shiftm = -shift*(mq3-mq2); } |     else{ shiftm = -shift*(mq3-mq2); } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   Vector<Coeff_t> diag(Ls,1.0); |   std::vector<Coeff_t> diag(Ls,1.0); | ||||||
|   Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; |   std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; | ||||||
|   Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp; |   std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp; | ||||||
|  |  | ||||||
| #if(0) | #if(0) | ||||||
|   std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl; |   std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl; | ||||||
| @@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& | |||||||
|     else{ shiftm = -shift*(mq3-mq2); } |     else{ shiftm = -shift*(mq3-mq2); } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   Vector<Coeff_t> diag(Ls,1.0); |   std::vector<Coeff_t> diag(Ls,1.0); | ||||||
|   Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; |   std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; | ||||||
|   Vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm; |   std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm; | ||||||
|  |  | ||||||
|   this->M5Ddag(psi, chi, chi, lower, diag, upper); |   this->M5Ddag(psi, chi, chi, lower, diag, upper); | ||||||
| } | } | ||||||
| @@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c | |||||||
| { | { | ||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |  | ||||||
|   Vector<Coeff_t> diag = this->bee; |   std::vector<Coeff_t> diag = this->bee; | ||||||
|   Vector<Coeff_t> upper(Ls); |   std::vector<Coeff_t> upper(Ls); | ||||||
|   Vector<Coeff_t> lower(Ls); |   std::vector<Coeff_t> lower(Ls); | ||||||
|  |  | ||||||
|   for(int s=0; s<Ls; s++){ |   for(int s=0; s<Ls; s++){ | ||||||
|     upper[s] = -this->cee[s]; |     upper[s] = -this->cee[s]; | ||||||
| @@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField | |||||||
| { | { | ||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |  | ||||||
|   Vector<Coeff_t> diag = this->bee; |   std::vector<Coeff_t> diag = this->bee; | ||||||
|   Vector<Coeff_t> upper(Ls); |   std::vector<Coeff_t> upper(Ls); | ||||||
|   Vector<Coeff_t> lower(Ls); |   std::vector<Coeff_t> lower(Ls); | ||||||
|  |  | ||||||
|   for(int s=0; s<Ls; s++){ |   for(int s=0; s<Ls; s++){ | ||||||
|     upper[s] = -this->cee[s]; |     upper[s] = -this->cee[s]; | ||||||
| @@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField | |||||||
|  |  | ||||||
| //Zolo | //Zolo | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c) | void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c) | ||||||
| { | { | ||||||
|   int   Ls    = this->Ls; |   int   Ls    = this->Ls; | ||||||
|   int   pm    = this->pm; |   int   pm    = this->pm; | ||||||
|   | |||||||
| @@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian | |||||||
|   UUUmu(&FourDimGrid), |   UUUmu(&FourDimGrid), | ||||||
|   UUUmuEven(&FourDimRedBlackGrid), |   UUUmuEven(&FourDimRedBlackGrid), | ||||||
|   UUUmuOdd(&FourDimRedBlackGrid), |   UUUmuOdd(&FourDimRedBlackGrid), | ||||||
|   Lebesgue(&FourDimGrid), |  | ||||||
|   LebesgueEvenOdd(&FourDimRedBlackGrid), |  | ||||||
|   _tmp(&FiveDimRedBlackGrid) |   _tmp(&FiveDimRedBlackGrid) | ||||||
| { | { | ||||||
|  |  | ||||||
| @@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat, | |||||||
|  |  | ||||||
| /*CHANGE */ | /*CHANGE */ | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st,  | ||||||
| 						    DoubledGaugeField & U,DoubledGaugeField & UUU, | 						    DoubledGaugeField & U,DoubledGaugeField & UUU, | ||||||
| 						    const FermionField &in, FermionField &out,int dag) | 						    const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) |   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) | ||||||
|     DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); |     DhopInternalOverlappedComms(st,U,UUU,in,out,dag); | ||||||
|   else |   else | ||||||
|     DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); |     DhopInternalSerialComms(st,U,UUU,in,out,dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, | void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,  | ||||||
| 								   DoubledGaugeField & U,DoubledGaugeField & UUU, | 								   DoubledGaugeField & U,DoubledGaugeField & UUU, | ||||||
| 								   const FermionField &in, FermionField &out,int dag) | 								   const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
| @@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & | |||||||
|   { |   { | ||||||
|     int interior=1; |     int interior=1; | ||||||
|     int exterior=0; |     int exterior=0; | ||||||
|     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); |     Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   st.CommsMerge(compressor); |   st.CommsMerge(compressor); | ||||||
| @@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & | |||||||
|   { |   { | ||||||
|     int interior=0; |     int interior=0; | ||||||
|     int exterior=1; |     int exterior=1; | ||||||
|     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); |     Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, | void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,  | ||||||
| 						    DoubledGaugeField & U,DoubledGaugeField & UUU, | 						    DoubledGaugeField & U,DoubledGaugeField & UUU, | ||||||
| 						    const FermionField &in, FermionField &out,int dag) | 						    const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
| @@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, | |||||||
|   { |   { | ||||||
|     int interior=1; |     int interior=1; | ||||||
|     int exterior=1; |     int exterior=1; | ||||||
|     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); |     Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
| } | } | ||||||
| /*CHANGE END*/ | /*CHANGE END*/ | ||||||
| @@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie | |||||||
|   assert(in.Checkerboard()==Even); |   assert(in.Checkerboard()==Even); | ||||||
|   out.Checkerboard() = Odd; |   out.Checkerboard() = Odd; | ||||||
|  |  | ||||||
|   DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag); |   DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag); | ||||||
| } | } | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | ||||||
| @@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie | |||||||
|   assert(in.Checkerboard()==Odd); |   assert(in.Checkerboard()==Odd); | ||||||
|   out.Checkerboard() = Even; |   out.Checkerboard() = Even; | ||||||
|  |  | ||||||
|   DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag); |   DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag); | ||||||
| } | } | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | ||||||
| @@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField | |||||||
|  |  | ||||||
|   out.Checkerboard() = in.Checkerboard(); |   out.Checkerboard() = in.Checkerboard(); | ||||||
|  |  | ||||||
|   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); |   DhopInternal(Stencil,Umu,UUUmu,in,out,dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -48,8 +48,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G | |||||||
|     StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even |     StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even | ||||||
|     StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd |     StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd | ||||||
|     mass(_mass), |     mass(_mass), | ||||||
|     Lebesgue(_grid), |  | ||||||
|     LebesgueEvenOdd(_cbgrid), |  | ||||||
|     Umu(&Fgrid), |     Umu(&Fgrid), | ||||||
|     UmuEven(&Hgrid), |     UmuEven(&Hgrid), | ||||||
|     UmuOdd(&Hgrid), |     UmuOdd(&Hgrid), | ||||||
| @@ -339,7 +337,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField & | |||||||
|  |  | ||||||
|   out.Checkerboard() = in.Checkerboard(); |   out.Checkerboard() = in.Checkerboard(); | ||||||
|  |  | ||||||
|   DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag); |   DhopInternal(Stencil, Umu, UUUmu, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -351,7 +349,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField | |||||||
|   assert(in.Checkerboard() == Even); |   assert(in.Checkerboard() == Even); | ||||||
|   out.Checkerboard() = Odd; |   out.Checkerboard() = Odd; | ||||||
|  |  | ||||||
|   DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag); |   DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -363,7 +361,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField | |||||||
|   assert(in.Checkerboard() == Odd); |   assert(in.Checkerboard() == Odd); | ||||||
|   out.Checkerboard() = Even; |   out.Checkerboard() = Even; | ||||||
|  |  | ||||||
|   DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag); |   DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -394,19 +392,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel | |||||||
|  |  | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, | void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,  | ||||||
| 						  DoubledGaugeField &U, | 						  DoubledGaugeField &U, | ||||||
| 						  DoubledGaugeField &UUU, | 						  DoubledGaugeField &UUU, | ||||||
| 						  const FermionField &in, | 						  const FermionField &in, | ||||||
| 						  FermionField &out, int dag)  | 						  FermionField &out, int dag)  | ||||||
| { | { | ||||||
|   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) |   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) | ||||||
|     DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); |     DhopInternalOverlappedComms(st,U,UUU,in,out,dag); | ||||||
|   else |   else | ||||||
|     DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); |     DhopInternalSerialComms(st,U,UUU,in,out,dag); | ||||||
| } | } | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, | void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,  | ||||||
| 								 DoubledGaugeField &U, | 								 DoubledGaugeField &U, | ||||||
| 								 DoubledGaugeField &UUU, | 								 DoubledGaugeField &UUU, | ||||||
| 								 const FermionField &in, | 								 const FermionField &in, | ||||||
| @@ -429,7 +427,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st | |||||||
|   { |   { | ||||||
|     int interior=1; |     int interior=1; | ||||||
|     int exterior=0; |     int exterior=0; | ||||||
|     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); |     Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   st.CommunicateComplete(requests); |   st.CommunicateComplete(requests); | ||||||
| @@ -440,13 +438,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st | |||||||
|   { |   { | ||||||
|     int interior=0; |     int interior=0; | ||||||
|     int exterior=1; |     int exterior=1; | ||||||
|     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); |     Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, | void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,  | ||||||
| 							     DoubledGaugeField &U, | 							     DoubledGaugeField &U, | ||||||
| 							     DoubledGaugeField &UUU, | 							     DoubledGaugeField &UUU, | ||||||
| 							     const FermionField &in, | 							     const FermionField &in, | ||||||
| @@ -460,7 +458,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le | |||||||
|   { |   { | ||||||
|     int interior=1; |     int interior=1; | ||||||
|     int exterior=1; |     int exterior=1; | ||||||
|     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); |     Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid); | |||||||
|   |   | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, | void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, | ||||||
| 				  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper) | 				  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper) | ||||||
| { | { | ||||||
|   chi_i.Checkerboard() = psi_i.Checkerboard(); |   chi_i.Checkerboard() = psi_i.Checkerboard(); | ||||||
|   GridBase *grid = psi_i.Grid(); |   GridBase *grid = psi_i.Grid(); | ||||||
| @@ -50,10 +50,14 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField | |||||||
|  |  | ||||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); |   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||||
|  |  | ||||||
|   auto pdiag = &diag[0]; |   auto pdiag  = &this->d_diag[0]; | ||||||
|   auto pupper = &upper[0]; |   auto pupper = &this->d_upper[0]; | ||||||
|   auto plower = &lower[0]; |   auto plower = &this->d_lower[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); | ||||||
|  |    | ||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|   int nloop = grid->oSites()/Ls; |   int nloop = grid->oSites()/Ls; | ||||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ |   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||||
| @@ -74,8 +78,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField | |||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, | void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, | ||||||
| 					Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper, | 					std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper, | ||||||
| 					Vector<Coeff_t> &shift_coeffs) | 					std::vector<Coeff_t> &shift_coeffs) | ||||||
| { | { | ||||||
|   chi_i.Checkerboard() = psi_i.Checkerboard(); |   chi_i.Checkerboard() = psi_i.Checkerboard(); | ||||||
|   GridBase *grid = psi_i.Grid(); |   GridBase *grid = psi_i.Grid(); | ||||||
| @@ -86,13 +90,18 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion | |||||||
|  |  | ||||||
|   auto pm  = this->pm; |   auto pm  = this->pm; | ||||||
|   int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator |   int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator | ||||||
|  |    | ||||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); |   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||||
|  |  | ||||||
|   auto pdiag = &diag[0]; |   auto pdiag  = &this->d_diag[0]; | ||||||
|   auto pupper = &upper[0]; |   auto pupper = &this->d_upper[0]; | ||||||
|   auto plower = &lower[0]; |   auto plower = &this->d_lower[0]; | ||||||
|   auto pshift_coeffs = &shift_coeffs[0]; |   auto pshift_coeffs = &this->d_shift_coefficients[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|   int nloop = grid->oSites()/Ls; |   int nloop = grid->oSites()/Ls; | ||||||
| @@ -119,7 +128,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion | |||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, | void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, | ||||||
| 				     Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper) | 				     std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper) | ||||||
| { | { | ||||||
|   chi_i.Checkerboard() = psi_i.Checkerboard(); |   chi_i.Checkerboard() = psi_i.Checkerboard(); | ||||||
|   GridBase *grid = psi_i.Grid(); |   GridBase *grid = psi_i.Grid(); | ||||||
| @@ -129,10 +138,14 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie | |||||||
|   autoView(chi , chi_i, AcceleratorWrite); |   autoView(chi , chi_i, AcceleratorWrite); | ||||||
|  |  | ||||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); |   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||||
|  |    | ||||||
|  |   auto pdiag  = &this->d_diag[0]; | ||||||
|  |   auto pupper = &this->d_upper[0]; | ||||||
|  |   auto plower = &this->d_lower[0]; | ||||||
|  |  | ||||||
|   auto pdiag = &diag[0]; |   acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto pupper = &upper[0]; |   acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); | ||||||
|   auto plower = &lower[0]; |   acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|   int nloop = grid->oSites()/Ls; |   int nloop = grid->oSites()/Ls; | ||||||
| @@ -154,8 +167,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie | |||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, | void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, | ||||||
| 					   Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper, | 					   std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper, | ||||||
| 					   Vector<Coeff_t> &shift_coeffs) | 					   std::vector<Coeff_t> &shift_coeffs) | ||||||
| { | { | ||||||
|   chi_i.Checkerboard() = psi_i.Checkerboard(); |   chi_i.Checkerboard() = psi_i.Checkerboard(); | ||||||
|   GridBase *grid = psi_i.Grid(); |   GridBase *grid = psi_i.Grid(); | ||||||
| @@ -167,11 +180,16 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm | |||||||
|  |  | ||||||
|   assert(phi.Checkerboard() == psi.Checkerboard()); |   assert(phi.Checkerboard() == psi.Checkerboard()); | ||||||
|  |  | ||||||
|   auto pdiag = &diag[0]; |   auto pdiag  = &this->d_diag[0]; | ||||||
|   auto pupper = &upper[0]; |   auto pupper = &this->d_upper[0]; | ||||||
|   auto plower = &lower[0]; |   auto plower = &this->d_lower[0]; | ||||||
|   auto pshift_coeffs = &shift_coeffs[0]; |   auto pshift_coeffs = &this->d_shift_coefficients[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t)); | ||||||
|  |    | ||||||
|   // Flops = 6.0*(Nc*Ns) *Ls*vol |   // Flops = 6.0*(Nc*Ns) *Ls*vol | ||||||
|   auto pm = this->pm; |   auto pm = this->pm; | ||||||
|  |  | ||||||
| @@ -212,11 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField & | |||||||
|   autoView(psi , psi_i, AcceleratorRead); |   autoView(psi , psi_i, AcceleratorRead); | ||||||
|   autoView(chi , chi_i, AcceleratorWrite); |   autoView(chi , chi_i, AcceleratorWrite); | ||||||
|  |  | ||||||
|   auto plee = & this->lee [0]; |   auto plee  = & this->d_lee [0]; | ||||||
|   auto pdee = & this->dee [0]; |   auto pdee  = & this->d_dee [0]; | ||||||
|   auto puee = & this->uee [0]; |   auto puee  = & this->d_uee [0]; | ||||||
|   auto pleem= & this->leem[0]; |   auto pleem = & this->d_leem[0]; | ||||||
|   auto pueem= & this->ueem[0]; |   auto pueem = & this->d_ueem[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } |   if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } | ||||||
|  |  | ||||||
| @@ -268,14 +292,23 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF | |||||||
|   autoView(psi , psi_i, AcceleratorRead); |   autoView(psi , psi_i, AcceleratorRead); | ||||||
|   autoView(chi , chi_i, AcceleratorWrite); |   autoView(chi , chi_i, AcceleratorWrite); | ||||||
|  |  | ||||||
|  |   // Move into object and constructor | ||||||
|   auto pm = this->pm; |   auto pm = this->pm; | ||||||
|   auto plee = & this->lee [0]; |   auto plee  = & this->d_lee [0]; | ||||||
|   auto pdee = & this->dee [0]; |   auto pdee  = & this->d_dee [0]; | ||||||
|   auto puee = & this->uee [0]; |   auto puee  = & this->d_uee [0]; | ||||||
|   auto pleem= & this->leem[0]; |   auto pleem = & this->d_leem[0]; | ||||||
|   auto pueem= & this->ueem[0]; |   auto pueem = & this->d_ueem[0]; | ||||||
|   auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0]; |   auto pMooeeInv_shift_lc   = &this->d_MooeeInv_shift_lc[0]; | ||||||
|   auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0]; |   auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   int nloop = grid->oSites()/Ls; |   int nloop = grid->oSites()/Ls; | ||||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ |   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||||
| @@ -333,11 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel | |||||||
|   autoView(psi , psi_i, AcceleratorRead); |   autoView(psi , psi_i, AcceleratorRead); | ||||||
|   autoView(chi , chi_i, AcceleratorWrite); |   autoView(chi , chi_i, AcceleratorWrite); | ||||||
|  |  | ||||||
|   auto plee = & this->lee [0]; |   auto plee  = &this->d_lee [0]; | ||||||
|   auto pdee = & this->dee [0]; |   auto pdee  = &this->d_dee [0]; | ||||||
|   auto puee = & this->uee [0]; |   auto puee  = &this->d_uee [0]; | ||||||
|   auto pleem= & this->leem[0]; |   auto pleem = &this->d_leem[0]; | ||||||
|   auto pueem= & this->ueem[0]; |   auto pueem = &this->d_ueem[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|   int nloop = grid->oSites()/Ls; |   int nloop = grid->oSites()/Ls; | ||||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ |   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||||
| @@ -387,13 +426,25 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi | |||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |  | ||||||
|   auto pm = this->pm; |   auto pm = this->pm; | ||||||
|   auto plee = & this->lee [0]; |   auto plee  = & this->d_lee [0]; | ||||||
|   auto pdee = & this->dee [0]; |   auto pdee  = & this->d_dee [0]; | ||||||
|   auto puee = & this->uee [0]; |   auto puee  = & this->d_uee [0]; | ||||||
|   auto pleem= & this->leem[0]; |   auto pleem = & this->d_leem[0]; | ||||||
|   auto pueem= & this->ueem[0]; |   auto pueem = & this->d_ueem[0]; | ||||||
|   auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0]; |  | ||||||
|   auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; |   auto pMooeeInvDag_shift_lc   = &this->d_MooeeInv_shift_lc[0]; | ||||||
|  |   auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0]; | ||||||
|  |  | ||||||
|  |   acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t)); | ||||||
|  |   acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t)); | ||||||
|  |  | ||||||
|  |   //  auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0]; | ||||||
|  |   //  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; | ||||||
|  |  | ||||||
|   int nloop = grid->oSites()/Ls; |   int nloop = grid->oSites()/Ls; | ||||||
|   accelerator_for(sss,nloop,Simd::Nsimd(),{ |   accelerator_for(sss,nloop,Simd::Nsimd(),{ | ||||||
|   | |||||||
| @@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi) | |||||||
| { | { | ||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |  | ||||||
|   Vector<Coeff_t> diag(Ls,1.0); |   std::vector<Coeff_t> diag(Ls,1.0); | ||||||
|   Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1; |   std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1; | ||||||
|   Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1; |   std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1; | ||||||
|  |  | ||||||
|   // no shift term |   // no shift term | ||||||
|   if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); } |   if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); } | ||||||
| @@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi) | |||||||
| { | { | ||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |  | ||||||
|   Vector<Coeff_t> diag(Ls,1.0); |   std::vector<Coeff_t> diag(Ls,1.0); | ||||||
|   Vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1; |   std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1; | ||||||
|   Vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1; |   std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1; | ||||||
|  |  | ||||||
|   // no shift term |   // no shift term | ||||||
|   if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); } |   if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); } | ||||||
| @@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi) | |||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |  | ||||||
|   // coefficients of Mooee |   // coefficients of Mooee | ||||||
|   Vector<Coeff_t> diag = this->bee; |   std::vector<Coeff_t> diag = this->bee; | ||||||
|   Vector<Coeff_t> upper(Ls); |   std::vector<Coeff_t> upper(Ls); | ||||||
|   Vector<Coeff_t> lower(Ls); |   std::vector<Coeff_t> lower(Ls); | ||||||
|   for(int s=0; s<Ls; s++){ |   for(int s=0; s<Ls; s++){ | ||||||
|     upper[s] = -this->cee[s]; |     upper[s] = -this->cee[s]; | ||||||
|     lower[s] = -this->cee[s]; |     lower[s] = -this->cee[s]; | ||||||
| @@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch | |||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |  | ||||||
|   // coefficients of MooeeDag |   // coefficients of MooeeDag | ||||||
|   Vector<Coeff_t> diag = this->bee; |   std::vector<Coeff_t> diag = this->bee; | ||||||
|   Vector<Coeff_t> upper(Ls); |   std::vector<Coeff_t> upper(Ls); | ||||||
|   Vector<Coeff_t> lower(Ls); |   std::vector<Coeff_t> lower(Ls); | ||||||
|   for(int s=0; s<Ls; s++){ |   for(int s=0; s<Ls; s++){ | ||||||
|     if(s==0) { |     if(s==0) { | ||||||
|       upper[s] = -this->cee[s+1]; |       upper[s] = -this->cee[s+1]; | ||||||
| @@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps() | |||||||
|   // Tridiagonal solve for MooeeInvDag_shift_lc |   // Tridiagonal solve for MooeeInvDag_shift_lc | ||||||
|   { |   { | ||||||
|     Coeff_t m(0.0); |     Coeff_t m(0.0); | ||||||
|     Vector<Coeff_t> d = Mooee_shift; |     std::vector<Coeff_t> d = Mooee_shift; | ||||||
|     Vector<Coeff_t> u(Ls,0.0); |     std::vector<Coeff_t> u(Ls,0.0); | ||||||
|     Vector<Coeff_t> y(Ls,0.0); |     std::vector<Coeff_t> y(Ls,0.0); | ||||||
|     Vector<Coeff_t> q(Ls,0.0); |     std::vector<Coeff_t> q(Ls,0.0); | ||||||
|     if(pm == 1){ u[0] = 1.0; } |     if(pm == 1){ u[0] = 1.0; } | ||||||
|     else{ u[Ls-1] = 1.0; } |     else{ u[Ls-1] = 1.0; } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -48,8 +48,6 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed | |||||||
|     StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even |     StencilEven(&Hgrid, npoint, Even, directions, displacements,p),  // source is Even | ||||||
|     StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd |     StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p),  // source is Odd | ||||||
|     mass(_mass), |     mass(_mass), | ||||||
|     Lebesgue(_grid), |  | ||||||
|     LebesgueEvenOdd(_cbgrid), |  | ||||||
|     Umu(&Fgrid), |     Umu(&Fgrid), | ||||||
|     UmuEven(&Hgrid), |     UmuEven(&Hgrid), | ||||||
|     UmuOdd(&Hgrid), |     UmuOdd(&Hgrid), | ||||||
| @@ -268,7 +266,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out | |||||||
|  |  | ||||||
|   out.Checkerboard() = in.Checkerboard(); |   out.Checkerboard() = in.Checkerboard(); | ||||||
|  |  | ||||||
|   DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); |   DhopInternal(Stencil, Umu, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -280,7 +278,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o | |||||||
|   assert(in.Checkerboard() == Even); |   assert(in.Checkerboard() == Even); | ||||||
|   out.Checkerboard() = Odd; |   out.Checkerboard() = Odd; | ||||||
|  |  | ||||||
|   DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); |   DhopInternal(StencilEven, UmuOdd, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -292,7 +290,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o | |||||||
|   assert(in.Checkerboard() == Odd); |   assert(in.Checkerboard() == Odd); | ||||||
|   out.Checkerboard() = Even; |   out.Checkerboard() = Even; | ||||||
|  |  | ||||||
|   DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); |   DhopInternal(StencilOdd, UmuEven, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -323,18 +321,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField & | |||||||
|  |  | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, | void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, | ||||||
| 					       DoubledGaugeField &U, | 					       DoubledGaugeField &U, | ||||||
| 					       const FermionField &in, | 					       const FermionField &in, | ||||||
| 					       FermionField &out, int dag)  | 					       FermionField &out, int dag)  | ||||||
| { | { | ||||||
|   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) |   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) | ||||||
|     DhopInternalOverlappedComms(st,lo,U,in,out,dag); |     DhopInternalOverlappedComms(st,U,in,out,dag); | ||||||
|   else |   else | ||||||
|     DhopInternalSerialComms(st,lo,U,in,out,dag); |     DhopInternalSerialComms(st,U,in,out,dag); | ||||||
| } | } | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, | void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, | ||||||
| 							      DoubledGaugeField &U, | 							      DoubledGaugeField &U, | ||||||
| 							      const FermionField &in, | 							      const FermionField &in, | ||||||
| 							      FermionField &out, int dag)  | 							      FermionField &out, int dag)  | ||||||
| @@ -356,7 +354,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L | |||||||
|   { |   { | ||||||
|     int interior=1; |     int interior=1; | ||||||
|     int exterior=0; |     int exterior=0; | ||||||
|     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); |     Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   st.CommunicateComplete(requests); |   st.CommunicateComplete(requests); | ||||||
| @@ -367,12 +365,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L | |||||||
|   { |   { | ||||||
|     int interior=0; |     int interior=0; | ||||||
|     int exterior=1; |     int exterior=1; | ||||||
|     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); |     Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, | void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, | ||||||
| 							  DoubledGaugeField &U, | 							  DoubledGaugeField &U, | ||||||
| 							  const FermionField &in, | 							  const FermionField &in, | ||||||
| 							  FermionField &out, int dag)  | 							  FermionField &out, int dag)  | ||||||
| @@ -385,7 +383,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes | |||||||
|   { |   { | ||||||
|     int interior=1; |     int interior=1; | ||||||
|     int exterior=1; |     int exterior=1; | ||||||
|     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); |     Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -237,7 +237,32 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi | |||||||
|   //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H |   //           ( 0     -sqrt(p_i)*amax   |  2 R gamma_5 + p0/amax 2H | ||||||
|   // |   // | ||||||
|  |  | ||||||
|   this->DW(psi,D,DaggerNo);  |   this->DW(psi,D,DaggerNo); | ||||||
|  |  | ||||||
|  |   // DW - DW+iqslash | ||||||
|  |   //  (g5 Dw)^dag = g5 Dw | ||||||
|  |   //  (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu | ||||||
|  |   if ( qmu.size() ) { | ||||||
|  |  | ||||||
|  |     std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl; | ||||||
|  |     assert(qmu.size()==Nd); | ||||||
|  |  | ||||||
|  |     FermionField qslash_psi(psi.Grid()); | ||||||
|  |  | ||||||
|  |     Gamma::Algebra Gmu [] = { | ||||||
|  | 			     Gamma::Algebra::GammaX, | ||||||
|  | 			     Gamma::Algebra::GammaY, | ||||||
|  | 			     Gamma::Algebra::GammaZ, | ||||||
|  | 			     Gamma::Algebra::GammaT | ||||||
|  |     }; | ||||||
|  |     qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi); | ||||||
|  |     for(int mu=1;mu<Nd;mu++){ | ||||||
|  |       qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi); | ||||||
|  |     } | ||||||
|  |     ComplexD ci(0.0,1.0); | ||||||
|  |     qslash_psi = ci*qslash_psi ; // i qslash | ||||||
|  |     D = D + qslash_psi; | ||||||
|  |   } | ||||||
|  |  | ||||||
|   int nblock=(Ls-1)/2; |   int nblock=(Ls-1)/2; | ||||||
|   for(int b=0;b<nblock;b++){ |   for(int b=0;b<nblock;b++){ | ||||||
| @@ -255,15 +280,55 @@ void   PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi | |||||||
|   } |   } | ||||||
| 	 | 	 | ||||||
|   { |   { | ||||||
|  |     // The 'conventional' Cayley overlap operator is | ||||||
|  |     // | ||||||
|  |     // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw | ||||||
|  |     // | ||||||
|  |     // | ||||||
|  |     // With massless limit 1/2(1+g5 sgnHw) | ||||||
|  |     // | ||||||
|  |     // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2) | ||||||
|  |     // | ||||||
|  |     // However, the conventional normalisation has both a leading order factor of 2 in Zq | ||||||
|  |     // at tree level AND a mass dependent (1-m) that are convenient to absorb. | ||||||
|  |     // | ||||||
|  |     // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is | ||||||
|  |     // | ||||||
|  |     // num = -i sin kmu gmu | ||||||
|  |     // | ||||||
|  |     // denom ( sqrt(sk^2 + (2shk^2 - 1)^2 | ||||||
|  |     //    b_k = sk2 - M5; | ||||||
|  |     //      | ||||||
|  |     //    w_k = sqrt(sk + b_k*b_k); | ||||||
|  |     // | ||||||
|  |     //    denom= ( w_k + b_k + mass*mass) ; | ||||||
|  |     // | ||||||
|  |     //    denom= one/denom; | ||||||
|  |     //    out = num*denom; | ||||||
|  |     // | ||||||
|  |     // Chroma, and Grid define partial fraction via 4d operator | ||||||
|  |     // | ||||||
|  |     //   Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw | ||||||
|  |     // | ||||||
|  |     // Now since: | ||||||
|  |     // | ||||||
|  |     //      (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m) | ||||||
|  |     // | ||||||
|  |     // This corresponds to a modified mass parameter | ||||||
|  |     // | ||||||
|  |     // It has an annoying  | ||||||
|  |     // | ||||||
|  |     //  | ||||||
|     double R=(1+this->mass)/(1-this->mass); |     double R=(1+this->mass)/(1-this->mass); | ||||||
|     //R g5 psi[Ls] + p[0] H |     //R g5 psi[Ls] + p[0] Hw | ||||||
|     ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1); |     ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1); | ||||||
| 	 |      | ||||||
|     for(int b=0;b<nblock;b++){ |     for(int b=0;b<nblock;b++){ | ||||||
|       int s = 2*b+1; |       int s = 2*b+1; | ||||||
|       double pp = p[nblock-1-b]; |       double pp = p[nblock-1-b]; | ||||||
|       axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s); |       axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s); | ||||||
|     } |     } | ||||||
|  |     | ||||||
|   } |   } | ||||||
|  |  | ||||||
| } | } | ||||||
| @@ -411,17 +476,18 @@ void  PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App | |||||||
|       int Ls = this->Ls; |       int Ls = this->Ls; | ||||||
|       conformable(solution5d.Grid(),this->FermionGrid()); |       conformable(solution5d.Grid(),this->FermionGrid()); | ||||||
|       conformable(exported4d.Grid(),this->GaugeGrid()); |       conformable(exported4d.Grid(),this->GaugeGrid()); | ||||||
|       ExtractSlice(exported4d, solution5d, Ls-1, Ls-1); |       ExtractSlice(exported4d, solution5d, Ls-1, 0); | ||||||
|     } |     } | ||||||
|     template<class Impl> |     template<class Impl> | ||||||
|     void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) |     void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d) | ||||||
|     { |     { | ||||||
|  |       //void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog) | ||||||
|       int Ls = this->Ls; |       int Ls = this->Ls; | ||||||
|       conformable(imported5d.Grid(),this->FermionGrid()); |       conformable(imported5d.Grid(),this->FermionGrid()); | ||||||
|       conformable(input4d.Grid()   ,this->GaugeGrid()); |       conformable(input4d.Grid()   ,this->GaugeGrid()); | ||||||
|       FermionField tmp(this->FermionGrid()); |       FermionField tmp(this->FermionGrid()); | ||||||
|       tmp=Zero(); |       tmp=Zero(); | ||||||
|       InsertSlice(input4d, tmp, Ls-1, Ls-1); |       InsertSlice(input4d, tmp, Ls-1, 0); | ||||||
|       tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; |       tmp=Gamma(Gamma::Algebra::Gamma5)*tmp; | ||||||
|       this->Dminus(tmp,imported5d); |       this->Dminus(tmp,imported5d); | ||||||
|     } |     } | ||||||
| @@ -442,7 +508,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu, | |||||||
|  |  | ||||||
| { | { | ||||||
|   int Ls = this->Ls; |   int Ls = this->Ls; | ||||||
|  |   qmu.resize(0); | ||||||
|   assert((Ls&0x1)==1); // Odd Ls required |   assert((Ls&0x1)==1); // Odd Ls required | ||||||
|   int nrational=Ls-1; |   int nrational=Ls-1; | ||||||
|  |  | ||||||
| @@ -460,6 +526,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu, | |||||||
|   Approx::zolotarev_free(zdata); |   Approx::zolotarev_free(zdata); | ||||||
|  |  | ||||||
| } | } | ||||||
|  | template<class Impl> | ||||||
|  | PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu, | ||||||
|  | 							 GridCartesian         &FiveDimGrid, | ||||||
|  | 							 GridRedBlackCartesian &FiveDimRedBlackGrid, | ||||||
|  | 							 GridCartesian         &FourDimGrid, | ||||||
|  | 							 GridRedBlackCartesian &FourDimRedBlackGrid, | ||||||
|  | 							 RealD _mass,RealD M5, | ||||||
|  | 							 std::vector<RealD> &_qmu, | ||||||
|  | 							 const ImplParams &p) | ||||||
|  |   : PartialFractionFermion5D<Impl>(_Umu, | ||||||
|  | 			     FiveDimGrid,FiveDimRedBlackGrid, | ||||||
|  | 			     FourDimGrid,FourDimRedBlackGrid, | ||||||
|  | 			     _mass,M5,p) | ||||||
|  | { | ||||||
|  |   qmu=_qmu; | ||||||
|  | } | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -375,23 +375,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st, | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| /* |  | ||||||
| #define DHOP_SITE_HAND_INSTANTIATE(IMPL)				\ |  | ||||||
|   template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ |  | ||||||
| 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ |  | ||||||
| 						     SiteSpinor *buf, int LLs, int sU, \ |  | ||||||
| 						     const FermionFieldView &in, FermionFieldView &out, int dag); \ |  | ||||||
| 									\ |  | ||||||
|   template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ |  | ||||||
| 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ |  | ||||||
| 						     SiteSpinor *buf, int LLs, int sU, \ |  | ||||||
| 						     const FermionFieldView &in, FermionFieldView &out, int dag); \ |  | ||||||
| 									\ |  | ||||||
|   template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ |  | ||||||
| 						     DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ |  | ||||||
| 						     SiteSpinor *buf, int LLs, int sU, \ |  | ||||||
| 						     const FermionFieldView &in, FermionFieldView &out, int dag); \ |  | ||||||
| */ |  | ||||||
| #undef LOAD_CHI | #undef LOAD_CHI | ||||||
| #undef HAND_DECLARATIONS | #undef HAND_DECLARATIONS | ||||||
|  |  | ||||||
|   | |||||||
| @@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie | |||||||
|   }); |   }); | ||||||
|  |  | ||||||
| template <class Impl>  | template <class Impl>  | ||||||
| void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,  | void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,  | ||||||
| 					  DoubledGaugeField &U, DoubledGaugeField &UUU,  | 					  DoubledGaugeField &U, DoubledGaugeField &UUU,  | ||||||
| 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior) | 					  const FermionField &in, FermionField &out, int dag, int interior,int exterior) | ||||||
| { | { | ||||||
| @@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, | |||||||
|   assert(0 && " Kernel optimisation case not covered "); |   assert(0 && " Kernel optimisation case not covered "); | ||||||
| } | } | ||||||
| template <class Impl>  | template <class Impl>  | ||||||
| void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,  | void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st,  | ||||||
| 				       DoubledGaugeField &U, | 				       DoubledGaugeField &U, | ||||||
| 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior) | 				       const FermionField &in, FermionField &out, int dag, int interior,int exterior) | ||||||
| { | { | ||||||
|   | |||||||
| @@ -58,15 +58,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu, | |||||||
|   Umu(_FourDimGrid), |   Umu(_FourDimGrid), | ||||||
|   UmuEven(_FourDimRedBlackGrid), |   UmuEven(_FourDimRedBlackGrid), | ||||||
|   UmuOdd (_FourDimRedBlackGrid), |   UmuOdd (_FourDimRedBlackGrid), | ||||||
|   Lebesgue(_FourDimGrid), |  | ||||||
|   LebesgueEvenOdd(_FourDimRedBlackGrid), |  | ||||||
|   _tmp(&FiveDimRedBlackGrid), |   _tmp(&FiveDimRedBlackGrid), | ||||||
|   Dirichlet(0) |   Dirichlet(0) | ||||||
| { | { | ||||||
|   Stencil.lo     = &Lebesgue; |  | ||||||
|   StencilEven.lo = &LebesgueEvenOdd; |  | ||||||
|   StencilOdd.lo  = &LebesgueEvenOdd; |  | ||||||
|    |  | ||||||
|   // some assertions |   // some assertions | ||||||
|   assert(FiveDimGrid._ndimension==5); |   assert(FiveDimGrid._ndimension==5); | ||||||
|   assert(FourDimGrid._ndimension==4); |   assert(FourDimGrid._ndimension==4); | ||||||
| @@ -305,19 +299,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat, | |||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, | ||||||
|                                          DoubledGaugeField & U, |                                          DoubledGaugeField & U, | ||||||
|                                          const FermionField &in, FermionField &out,int dag) |                                          const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) |   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) | ||||||
|     DhopInternalOverlappedComms(st,lo,U,in,out,dag); |     DhopInternalOverlappedComms(st,U,in,out,dag); | ||||||
|   else  |   else  | ||||||
|     DhopInternalSerialComms(st,lo,U,in,out,dag); |     DhopInternalSerialComms(st,U,in,out,dag); | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, | void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, | ||||||
| 							DoubledGaugeField & U, | 							DoubledGaugeField & U, | ||||||
| 							const FermionField &in, FermionField &out,int dag) | 							const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
| @@ -331,22 +325,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
|   // Start comms  // Gather intranode and extra node differentiated?? |   // Start comms  // Gather intranode and extra node differentiated?? | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|   { |   { | ||||||
|  |     //    std::cout << " WilsonFermion5D gather " <<std::endl; | ||||||
|     GRID_TRACE("Gather"); |     GRID_TRACE("Gather"); | ||||||
|     st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine |     st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine | ||||||
|   } |   } | ||||||
|    |    | ||||||
|  |   //  std::cout << " WilsonFermion5D Communicate Begin " <<std::endl; | ||||||
|   std::vector<std::vector<CommsRequest_t> > requests; |   std::vector<std::vector<CommsRequest_t> > requests; | ||||||
|   auto id=traceStart("Communicate overlapped"); |  | ||||||
|   st.CommunicateBegin(requests); |  | ||||||
|  |  | ||||||
|  | #if 1 | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|   // Overlap with comms |   // Overlap with comms | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|   { |   st.CommunicateBegin(requests); | ||||||
|     GRID_TRACE("MergeSHM"); |   st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms  | ||||||
|     st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms | #endif | ||||||
|   } |  | ||||||
|        |  | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|   // do the compute interior |   // do the compute interior | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
| @@ -358,22 +352,35 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
|     GRID_TRACE("DhopInterior"); |     GRID_TRACE("DhopInterior"); | ||||||
|     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); |     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); | ||||||
|   } |   } | ||||||
|  |    | ||||||
|  |   //ifdef GRID_ACCELERATED | ||||||
|  | #if 0 | ||||||
|  |   ///////////////////////////// | ||||||
|  |   // Overlap with comms -- on GPU the interior kernel call is nonblocking | ||||||
|  |   ///////////////////////////// | ||||||
|  |   st.CommunicateBegin(requests); | ||||||
|  |   st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms | ||||||
|  | #endif | ||||||
|  |    | ||||||
|  |    | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|   // Complete comms |   // Complete comms | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|  |   //  std::cout << " WilsonFermion5D Comms Complete " <<std::endl; | ||||||
|   st.CommunicateComplete(requests); |   st.CommunicateComplete(requests); | ||||||
|   traceStop(id); |   //  traceStop(id); | ||||||
|  |  | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|   // do the compute exterior |   // do the compute exterior | ||||||
|   ///////////////////////////// |   ///////////////////////////// | ||||||
|   { |   { | ||||||
|  |     //    std::cout << " WilsonFermion5D Comms Merge " <<std::endl; | ||||||
|     GRID_TRACE("Merge"); |     GRID_TRACE("Merge"); | ||||||
|     st.CommsMerge(compressor); |     st.CommsMerge(compressor); | ||||||
|   } |   } | ||||||
|    |    | ||||||
|  |  | ||||||
|  |   //  std::cout << " WilsonFermion5D Exterior " <<std::endl; | ||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     GRID_TRACE("DhopDagExterior"); |     GRID_TRACE("DhopDagExterior"); | ||||||
|     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); |     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); | ||||||
| @@ -381,11 +388,12 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg | |||||||
|     GRID_TRACE("DhopExterior"); |     GRID_TRACE("DhopExterior"); | ||||||
|     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); |     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); | ||||||
|   } |   } | ||||||
|  |   //  std::cout << " WilsonFermion5D Done " <<std::endl; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, | void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,  | ||||||
| 						    DoubledGaugeField & U, | 						    DoubledGaugeField & U, | ||||||
| 						    const FermionField &in,  | 						    const FermionField &in,  | ||||||
| 						    FermionField &out,int dag) | 						    FermionField &out,int dag) | ||||||
| @@ -395,11 +403,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr | |||||||
|  |  | ||||||
|   int LLs = in.Grid()->_rdimensions[0]; |   int LLs = in.Grid()->_rdimensions[0]; | ||||||
|  |  | ||||||
|  |   //  std::cout << " WilsonFermion5D Halo exch " <<std::endl; | ||||||
|   { |   { | ||||||
|     GRID_TRACE("HaloExchange"); |     GRID_TRACE("HaloExchange"); | ||||||
|     st.HaloExchangeOpt(in,compressor); |     st.HaloExchangeOpt(in,compressor); | ||||||
|   } |   } | ||||||
|    |    | ||||||
|  |   //  std::cout << " WilsonFermion5D Dhop " <<std::endl; | ||||||
|   int Opt = WilsonKernelsStatic::Opt; |   int Opt = WilsonKernelsStatic::Opt; | ||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     GRID_TRACE("DhopDag"); |     GRID_TRACE("DhopDag"); | ||||||
| @@ -408,6 +418,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr | |||||||
|     GRID_TRACE("Dhop"); |     GRID_TRACE("Dhop"); | ||||||
|     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); |     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); | ||||||
|   } |   } | ||||||
|  |   //  std::cout << " WilsonFermion5D Done " <<std::endl; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -420,7 +431,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int | |||||||
|   assert(in.Checkerboard()==Even); |   assert(in.Checkerboard()==Even); | ||||||
|   out.Checkerboard() = Odd; |   out.Checkerboard() = Odd; | ||||||
|  |  | ||||||
|   DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag); |   DhopInternal(StencilEven,UmuOdd,in,out,dag); | ||||||
| } | } | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | ||||||
| @@ -431,8 +442,31 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int | |||||||
|   assert(in.Checkerboard()==Odd); |   assert(in.Checkerboard()==Odd); | ||||||
|   out.Checkerboard() = Even; |   out.Checkerboard() = Even; | ||||||
|  |  | ||||||
|   DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag); |   DhopInternal(StencilOdd,UmuEven,in,out,dag); | ||||||
| } | } | ||||||
|  | template<class Impl> | ||||||
|  | void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  |   int dag =0 ; | ||||||
|  |   conformable(in.Grid(),FermionGrid()); // verifies full grid | ||||||
|  |   conformable(in.Grid(),out.Grid()); | ||||||
|  |   out.Checkerboard() = in.Checkerboard(); | ||||||
|  |   Compressor compressor(dag); | ||||||
|  |   Stencil.HaloExchangeOpt(in,compressor); | ||||||
|  | } | ||||||
|  | template<class Impl> | ||||||
|  | void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids) | ||||||
|  | { | ||||||
|  |   conformable(in.Grid(),FermionGrid()); // verifies full grid | ||||||
|  |   conformable(in.Grid(),out.Grid()); | ||||||
|  |  | ||||||
|  |   out.Checkerboard() = in.Checkerboard(); | ||||||
|  |  | ||||||
|  |   int LLs = in.Grid()->_rdimensions[0]; | ||||||
|  |   int Opt = WilsonKernelsStatic::Opt; | ||||||
|  |   Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids); | ||||||
|  | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
| @@ -441,7 +475,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d | |||||||
|  |  | ||||||
|   out.Checkerboard() = in.Checkerboard(); |   out.Checkerboard() = in.Checkerboard(); | ||||||
|  |  | ||||||
|   DhopInternal(Stencil,Lebesgue,Umu,in,out,dag); |   DhopInternal(Stencil,Umu,in,out,dag); | ||||||
| } | } | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag) | ||||||
| @@ -735,6 +769,15 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe | |||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) | void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) | ||||||
|  | { | ||||||
|  |   std::vector<double> empty_q(Nd,0.0); | ||||||
|  |   MomentumSpacePropagatorHwQ(out,in,mass,twist,empty_q); | ||||||
|  | } | ||||||
|  | template<class Impl> | ||||||
|  | void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in, | ||||||
|  | 						       RealD mass, | ||||||
|  | 						       std::vector<double> twist, | ||||||
|  | 						       std::vector<double> qmu) | ||||||
| { | { | ||||||
|     Gamma::Algebra Gmu [] = { |     Gamma::Algebra Gmu [] = { | ||||||
|       Gamma::Algebra::GammaX, |       Gamma::Algebra::GammaX, | ||||||
| @@ -750,6 +793,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe | |||||||
|     typedef typename FermionField::scalar_type ScalComplex; |     typedef typename FermionField::scalar_type ScalComplex; | ||||||
|  |  | ||||||
|     typedef Lattice<iSinglet<vector_type> > LatComplex; |     typedef Lattice<iSinglet<vector_type> > LatComplex; | ||||||
|  |     typedef iSpinMatrix<ScalComplex> SpinMat; | ||||||
|  |  | ||||||
|  |  | ||||||
|     Coordinate latt_size   = _grid->_fdimensions; |     Coordinate latt_size   = _grid->_fdimensions; | ||||||
| @@ -767,8 +811,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe | |||||||
|     LatComplex kmu(_grid);  |     LatComplex kmu(_grid);  | ||||||
|     ScalComplex ci(0.0,1.0); |     ScalComplex ci(0.0,1.0); | ||||||
|  |  | ||||||
|  |     std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl; | ||||||
|  |      | ||||||
|     for(int mu=0;mu<Nd;mu++) { |     for(int mu=0;mu<Nd;mu++) { | ||||||
|  |        | ||||||
|       LatticeCoordinate(kmu,mu); |       LatticeCoordinate(kmu,mu); | ||||||
|  |  | ||||||
|       RealD TwoPiL =  M_PI * 2.0/ latt_size[mu]; |       RealD TwoPiL =  M_PI * 2.0/ latt_size[mu]; | ||||||
| @@ -777,9 +823,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe | |||||||
|       kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions |       kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions | ||||||
|  |  | ||||||
|       sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5); |       sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5); | ||||||
|       sk  = sk  + sin(kmu)*sin(kmu);  |  | ||||||
|  |  | ||||||
|       num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in); |       sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]);  | ||||||
|  |  | ||||||
|  |       // Terms for boosted Fermion | ||||||
|  |       // 1/2 [ -i gamma.(sin p + q )     ] | ||||||
|  |       //     [ --------------------- + 1 ] | ||||||
|  |       //     [         wq + b            ] | ||||||
|  |       // | ||||||
|  |       // wq = sqrt( (sinp+q)^2 + b^2 ) | ||||||
|  |       // | ||||||
|  |        | ||||||
|  |       num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in); | ||||||
|  |  | ||||||
|     } |     } | ||||||
|     num = num + mass * in ; |     num = num + mass * in ; | ||||||
|   | |||||||
| @@ -52,17 +52,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, | |||||||
|     StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even |     StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even | ||||||
|     StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd |     StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd | ||||||
|     mass(_mass), |     mass(_mass), | ||||||
|     Lebesgue(_grid), |  | ||||||
|     LebesgueEvenOdd(_cbgrid), |  | ||||||
|     Umu(&Fgrid), |     Umu(&Fgrid), | ||||||
|     UmuEven(&Hgrid), |     UmuEven(&Hgrid), | ||||||
|     UmuOdd(&Hgrid), |     UmuOdd(&Hgrid), | ||||||
|       _tmp(&Hgrid), |       _tmp(&Hgrid), | ||||||
|       anisotropyCoeff(anis) |       anisotropyCoeff(anis) | ||||||
| { | { | ||||||
|   Stencil.lo     = &Lebesgue; |  | ||||||
|   StencilEven.lo = &LebesgueEvenOdd; |  | ||||||
|   StencilOdd.lo  = &LebesgueEvenOdd; |  | ||||||
|   // Allocate the required comms buffer |   // Allocate the required comms buffer | ||||||
|   ImportGauge(_Umu); |   ImportGauge(_Umu); | ||||||
|   if  (anisotropyCoeff.isAnisotropic){ |   if  (anisotropyCoeff.isAnisotropic){ | ||||||
| @@ -314,7 +309,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da | |||||||
|  |  | ||||||
|   out.Checkerboard() = in.Checkerboard(); |   out.Checkerboard() = in.Checkerboard(); | ||||||
|  |  | ||||||
|   DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); |   DhopInternal(Stencil, Umu, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -326,7 +321,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int | |||||||
|   assert(in.Checkerboard() == Even); |   assert(in.Checkerboard() == Even); | ||||||
|   out.Checkerboard() = Odd; |   out.Checkerboard() = Odd; | ||||||
|  |  | ||||||
|   DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); |   DhopInternal(StencilEven, UmuOdd, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -338,7 +333,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d | |||||||
|   assert(in.Checkerboard() == Odd); |   assert(in.Checkerboard() == Odd); | ||||||
|   out.Checkerboard() = Even; |   out.Checkerboard() = Even; | ||||||
|  |  | ||||||
|   DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); |   DhopInternal(StencilOdd, UmuEven, in, out, dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| @@ -391,21 +386,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out, | |||||||
| }; | }; | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, | void WilsonFermion<Impl>::DhopInternal(StencilImpl &st,  | ||||||
|                                        DoubledGaugeField &U, |                                        DoubledGaugeField &U, | ||||||
|                                        const FermionField &in, |                                        const FermionField &in, | ||||||
|                                        FermionField &out, int dag) |                                        FermionField &out, int dag) | ||||||
| { | { | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) |   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) | ||||||
|     DhopInternalOverlappedComms(st,lo,U,in,out,dag); |     DhopInternalOverlappedComms(st,U,in,out,dag); | ||||||
|   else |   else | ||||||
| #endif | #endif | ||||||
|     DhopInternalSerial(st,lo,U,in,out,dag); |     DhopInternalSerial(st,U,in,out,dag); | ||||||
| } | } | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, | void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,  | ||||||
| 						      DoubledGaugeField &U, | 						      DoubledGaugeField &U, | ||||||
| 						      const FermionField &in, | 						      const FermionField &in, | ||||||
| 						      FermionField &out, int dag) | 						      FermionField &out, int dag) | ||||||
| @@ -474,10 +469,10 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO | |||||||
|  |  | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, | void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st,  | ||||||
|                                        DoubledGaugeField &U, | 					     DoubledGaugeField &U, | ||||||
|                                        const FermionField &in, | 					     const FermionField &in, | ||||||
|                                        FermionField &out, int dag) | 					     FermionField &out, int dag) | ||||||
| { | { | ||||||
|   GRID_TRACE("DhopSerial"); |   GRID_TRACE("DhopSerial"); | ||||||
|   assert((dag == DaggerNo) || (dag == DaggerYes)); |   assert((dag == DaggerNo) || (dag == DaggerYes)); | ||||||
|   | |||||||
| @@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| /// Switch off the 5d vectorised code optimisations | /// Switch off the 5d vectorised code optimisations | ||||||
| #undef DWFVEC5D | #undef DWFVEC5D | ||||||
|  |  | ||||||
| static Vector<vComplexF> signsF; | static std::vector<vComplexF> signsF; | ||||||
|  |  | ||||||
|   template<typename vtype>     |   template<typename vtype>     | ||||||
|   int setupSigns(Vector<vtype>& signs ){ |   int setupSigns(std::vector<vtype>& signs ){ | ||||||
|     Vector<vtype> bother(2); |     std::vector<vtype> bother(2); | ||||||
|     signs = bother; |     signs = bother; | ||||||
|     vrsign(signs[0]); |     vrsign(signs[0]); | ||||||
|     visign(signs[1]); |     visign(signs[1]); | ||||||
| @@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled | |||||||
|  |  | ||||||
| #include <simd/Intel512double.h> | #include <simd/Intel512double.h> | ||||||
|      |      | ||||||
| static Vector<vComplexD> signsD; | static std::vector<vComplexD> signsD; | ||||||
| static int signInitD = setupSigns(signsD); | static int signInitD = setupSigns(signsD); | ||||||
|      |      | ||||||
| #define MAYBEPERM(A,perm) if (perm) { A ; } | #define MAYBEPERM(A,perm) if (perm) { A ; } | ||||||
|   | |||||||
| @@ -411,6 +411,46 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S | |||||||
| #undef LoopBody | #undef LoopBody | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  | extern "C" { | ||||||
|  |     ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void ); | ||||||
|  |     uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void ); | ||||||
|  |     void  SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value ); | ||||||
|  | } | ||||||
|  | #ifdef GRID_SIMT | ||||||
|  | #define MAKE_ID(A) (intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id()) | ||||||
|  | #else | ||||||
|  | #define MAKE_ID(A) (0) | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #else | ||||||
|  |  | ||||||
|  | #define MAKE_ID(A) (0) | ||||||
|  |  | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define KERNEL_CALL_ID(A)						\ | ||||||
|  |   const uint64_t    NN = Nsite*Ls;					\ | ||||||
|  |   accelerator_forNB( ss, NN, Simd::Nsimd(), {				\ | ||||||
|  |       int sF = ss;							\ | ||||||
|  |       int sU = ss/Ls;							\ | ||||||
|  |       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\ | ||||||
|  |       const int Nsimd = SiteHalfSpinor::Nsimd();			\ | ||||||
|  |       const int lane=acceleratorSIMTlane(Nsimd);                        \ | ||||||
|  |       int idx=sF*Nsimd+lane;						\ | ||||||
|  |       uint64_t id = MAKE_ID();						\ | ||||||
|  |       ids[idx]=id;							\ | ||||||
|  |     });									\ | ||||||
|  |   accelerator_barrier(); | ||||||
|  |  | ||||||
| #define KERNEL_CALLNB(A)						\ | #define KERNEL_CALLNB(A)						\ | ||||||
|   const uint64_t    NN = Nsite*Ls;					\ |   const uint64_t    NN = Nsite*Ls;					\ | ||||||
| @@ -418,7 +458,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S | |||||||
|       int sF = ss;							\ |       int sF = ss;							\ | ||||||
|       int sU = ss/Ls;							\ |       int sU = ss/Ls;							\ | ||||||
|       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\ |       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);		\ | ||||||
|   }); |     }); | ||||||
|  |  | ||||||
| #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); | #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier(); | ||||||
|  |  | ||||||
| @@ -434,7 +474,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S | |||||||
|  |  | ||||||
| #define ASM_CALL(A)							\ | #define ASM_CALL(A)							\ | ||||||
|   thread_for( sss, Nsite, {						\ |   thread_for( sss, Nsite, {						\ | ||||||
|     int ss = st.lo->Reorder(sss);					\ |     int ss = sss; /*st.lo->Reorder(sss);*/			\ | ||||||
|     int sU = ss;							\ |     int sU = ss;							\ | ||||||
|     int sF = ss*Ls;							\ |     int sF = ss*Ls;							\ | ||||||
|     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\ |     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\ | ||||||
| @@ -451,6 +491,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S | |||||||
|     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\ |     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v);		\ | ||||||
|     });} |     });} | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, | void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 				     int Ls, int Nsite, const FermionField &in, FermionField &out, | 				     int Ls, int Nsite, const FermionField &in, FermionField &out, | ||||||
| @@ -462,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|     autoView(st_v , st,AcceleratorRead); |     autoView(st_v , st,AcceleratorRead); | ||||||
|  |  | ||||||
|    if( interior && exterior ) { |    if( interior && exterior ) { | ||||||
|      acceleratorFenceComputeStream(); |      //     acceleratorFenceComputeStream(); | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
| @@ -475,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;} |      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteInt);    return;} | ||||||
| #endif | #endif | ||||||
|    } else if( exterior ) { |    } else if( exterior ) { | ||||||
|      // dependent on result of merge |      //     // dependent on result of merge | ||||||
|      acceleratorFenceComputeStream(); |      acceleratorFenceComputeStream(); | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;} | ||||||
| @@ -485,6 +527,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|    } |    } | ||||||
|    assert(0 && " Kernel optimisation case not covered "); |    assert(0 && " Kernel optimisation case not covered "); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  | template <class Impl> | ||||||
|  | void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
|  | 				     int Ls, int Nsite, const FermionField &in, FermionField &out, | ||||||
|  | 				     uint64_t *ids) | ||||||
|  | { | ||||||
|  |     autoView(U_v  ,  U,AcceleratorRead); | ||||||
|  |     autoView(in_v , in,AcceleratorRead); | ||||||
|  |     autoView(out_v,out,AcceleratorWrite); | ||||||
|  |     autoView(st_v , st,AcceleratorRead); | ||||||
|  |     KERNEL_CALL_ID(GenericDhopSite); | ||||||
|  | } | ||||||
|   template <class Impl> |   template <class Impl> | ||||||
|   void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, |   void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 					  int Ls, int Nsite, const FermionField &in, FermionField &out, | 					  int Ls, int Nsite, const FermionField &in, FermionField &out, | ||||||
|   | |||||||
| @@ -40,6 +40,11 @@ public: | |||||||
|  |  | ||||||
|   INHERIT_GIMPL_TYPES(Gimpl); |   INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|  |  | ||||||
|  |   using Action<GaugeField>::S; | ||||||
|  |   using Action<GaugeField>::Sinitial; | ||||||
|  |   using Action<GaugeField>::deriv; | ||||||
|  |   using Action<GaugeField>::refresh; | ||||||
|  |  | ||||||
| private: | private: | ||||||
|   RealD c_plaq; |   RealD c_plaq; | ||||||
|   RealD c_rect; |   RealD c_rect; | ||||||
|   | |||||||
| @@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> { | |||||||
| public:   | public:   | ||||||
|   INHERIT_GIMPL_TYPES(Gimpl); |   INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|  |  | ||||||
|  |   using Action<GaugeField>::S; | ||||||
|  |   using Action<GaugeField>::Sinitial; | ||||||
|  |   using Action<GaugeField>::deriv; | ||||||
|  |   using Action<GaugeField>::refresh; | ||||||
|  |    | ||||||
|   /////////////////////////// constructors |   /////////////////////////// constructors | ||||||
|   explicit WilsonGaugeAction(RealD beta_):beta(beta_){}; |   explicit WilsonGaugeAction(RealD beta_):beta(beta_){}; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -40,7 +40,7 @@ public: | |||||||
|     U = Zero(); |     U = Zero(); | ||||||
|     LatticeColourMatrix tmp(Uin.Grid()); |     LatticeColourMatrix tmp(Uin.Grid()); | ||||||
|  |  | ||||||
|     Vector<typename SU<ncolour>::Matrix> ta(Dimension); |     std::vector<typename SU<ncolour>::Matrix> ta(Dimension); | ||||||
|  |  | ||||||
|     // Debug lines |     // Debug lines | ||||||
|     // LatticeMatrix uno(Uin.Grid()); |     // LatticeMatrix uno(Uin.Grid()); | ||||||
|   | |||||||
| @@ -43,7 +43,7 @@ public: | |||||||
|     U = Zero(); |     U = Zero(); | ||||||
|     LatticeColourMatrix tmp(Uin.Grid()); |     LatticeColourMatrix tmp(Uin.Grid()); | ||||||
|  |  | ||||||
|     Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension); |     std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension); | ||||||
|  |  | ||||||
|     for (int a = 0; a < Dimension; a++) |     for (int a = 0; a < Dimension; a++) | ||||||
|       GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]); |       GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]); | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -971,7 +971,9 @@ void BaryonUtils<FImpl>::BaryonGamma3pt( | |||||||
|   autoView( vq_ti , q_ti     , AcceleratorRead); |   autoView( vq_ti , q_ti     , AcceleratorRead); | ||||||
|   autoView( vq_tf , q_tf     , AcceleratorRead); |   autoView( vq_tf , q_tf     , AcceleratorRead); | ||||||
|  |  | ||||||
|   Vector<mobj> my_Dq_spec{Dq_spec1,Dq_spec2}; |   deviceVector<mobj> my_Dq_spec(2); | ||||||
|  |   acceleratorPut(my_Dq_spec[0],Dq_spec1); | ||||||
|  |   acceleratorPut(my_Dq_spec[1],Dq_spec2); | ||||||
|   mobj * Dq_spec_p = &my_Dq_spec[0]; |   mobj * Dq_spec_p = &my_Dq_spec[0]; | ||||||
|  |  | ||||||
|   if (group == 1) { |   if (group == 1) { | ||||||
| @@ -1300,7 +1302,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop, | |||||||
|   autoView( vd_tf   , qd_tf    , AcceleratorRead); |   autoView( vd_tf   , qd_tf    , AcceleratorRead); | ||||||
|   autoView( vs_ti   , qs_ti    , AcceleratorRead); |   autoView( vs_ti   , qs_ti    , AcceleratorRead); | ||||||
|  |  | ||||||
|   Vector<mobj> my_Dq_spec{Du_spec}; |   deviceVector<mobj> my_Dq_spec(1); | ||||||
|  |   acceleratorPut(my_Dq_spec[0],Du_spec); | ||||||
|   mobj * Dq_spec_p = &my_Dq_spec[0]; |   mobj * Dq_spec_p = &my_Dq_spec[0]; | ||||||
|  |  | ||||||
|   if(op == "Q1"){ |   if(op == "Q1"){ | ||||||
| @@ -1353,7 +1356,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti, | |||||||
|   autoView( vd_tf , qd_tf    , AcceleratorRead  ); |   autoView( vd_tf , qd_tf    , AcceleratorRead  ); | ||||||
|   autoView( vs_ti , qs_ti    , AcceleratorRead  ); |   autoView( vs_ti , qs_ti    , AcceleratorRead  ); | ||||||
|    |    | ||||||
|   Vector<mobj> my_Dq_spec{Du_spec}; |   deviceVector<mobj> my_Dq_spec(1); | ||||||
|  |   acceleratorPut(my_Dq_spec[0],Du_spec); | ||||||
|   mobj * Dq_spec_p = &my_Dq_spec[0]; |   mobj * Dq_spec_p = &my_Dq_spec[0]; | ||||||
|  |  | ||||||
|   if(op == "Q1"){ |   if(op == "Q1"){ | ||||||
| @@ -1544,7 +1548,9 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop, | |||||||
|   autoView( vd_tf   , qd_tf    , AcceleratorRead); |   autoView( vd_tf   , qd_tf    , AcceleratorRead); | ||||||
|   autoView( vs_ti   , qs_ti    , AcceleratorRead); |   autoView( vs_ti   , qs_ti    , AcceleratorRead); | ||||||
|  |  | ||||||
|   Vector<mobj> my_Dq_spec{Dd_spec,Ds_spec}; |   deviceVector<mobj> my_Dq_spec(2); | ||||||
|  |   acceleratorPut(my_Dq_spec[0],Dd_spec); | ||||||
|  |   acceleratorPut(my_Dq_spec[0],Ds_spec); | ||||||
|   mobj * Dq_spec_p = &my_Dq_spec[0]; |   mobj * Dq_spec_p = &my_Dq_spec[0]; | ||||||
|  |  | ||||||
|   if(op == "Q1"){ |   if(op == "Q1"){ | ||||||
|   | |||||||
| @@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) { | |||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| // Map a su2 subgroup number to the pair of rows that are non zero | // Map a su2 subgroup number to the pair of rows that are non zero | ||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) { | ||||||
|   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); |   assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2)); | ||||||
|  |  | ||||||
|   int spare = su2_index; |   int spare = su2_index; | ||||||
|   | |||||||
| @@ -62,7 +62,7 @@ public: | |||||||
|     // returns i(T_Adj)^index necessary for the projectors |     // returns i(T_Adj)^index necessary for the projectors | ||||||
|     // see definitions above |     // see definitions above | ||||||
|     iAdjTa = Zero(); |     iAdjTa = Zero(); | ||||||
|     Vector<iSUnMatrix<cplx> > ta(ncolour * ncolour - 1); |     iSUnMatrix<cplx> ta[ncolour * ncolour - 1]; | ||||||
|     iSUnMatrix<cplx> tmp; |     iSUnMatrix<cplx> tmp; | ||||||
|  |  | ||||||
|     // FIXME not very efficient to get all the generators every time |     // FIXME not very efficient to get all the generators every time | ||||||
|   | |||||||
| @@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) { | |||||||
| // Map a su2 subgroup number to the pair of rows that are non zero | // Map a su2 subgroup number to the pair of rows that are non zero | ||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| template <ONLY_IF_Sp> | template <ONLY_IF_Sp> | ||||||
| static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) { | static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) { | ||||||
|   const int nsp=ncolour/2; |   const int nsp=ncolour/2; | ||||||
|   assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2)); |   assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2)); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -72,7 +72,7 @@ public: | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   // Resident in managed memory |   // Resident in managed memory | ||||||
|   Vector<GeneralStencilEntry>  _entries;  |   deviceVector<GeneralStencilEntry>  _entries;  | ||||||
|  |  | ||||||
|   GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts) |   GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts) | ||||||
|   { |   { | ||||||
| @@ -141,7 +141,7 @@ public: | |||||||
| 	  //////////////////////////////////////////////// | 	  //////////////////////////////////////////////// | ||||||
| 	  // Store in look up table | 	  // Store in look up table | ||||||
| 	  //////////////////////////////////////////////// | 	  //////////////////////////////////////////////// | ||||||
| 	  this->_entries[lex] = SE; | 	  acceleratorPut(this->_entries[lex],SE); | ||||||
| 	} | 	} | ||||||
|       }); |       }); | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -19,7 +19,7 @@ public: | |||||||
|   static int PartialCompressionFactor(GridBase *grid) {return 1;}; |   static int PartialCompressionFactor(GridBase *grid) {return 1;}; | ||||||
|   // Decompress is after merge so ok |   // Decompress is after merge so ok | ||||||
|   template<class vobj,class cobj,class compressor>  |   template<class vobj,class cobj,class compressor>  | ||||||
|   static void Gather_plane_simple (commVector<std::pair<int,int> >& table, |   static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table, | ||||||
| 				   const Lattice<vobj> &rhs, | 				   const Lattice<vobj> &rhs, | ||||||
| 				   cobj *buffer, | 				   cobj *buffer, | ||||||
| 				   compressor &compress, | 				   compressor &compress, | ||||||
| @@ -35,7 +35,7 @@ public: | |||||||
|     rhs_v.ViewClose(); |     rhs_v.ViewClose(); | ||||||
|   } |   } | ||||||
|   template<class vobj,class cobj,class compressor> |   template<class vobj,class cobj,class compressor> | ||||||
|   static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, |   static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, | ||||||
| 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask, | 				    std::vector<cobj *> pointers,int dimension,int plane,int cbmask, | ||||||
| 				    compressor &compress,int type,int partial) | 				    compressor &compress,int type,int partial) | ||||||
|   { |   { | ||||||
| @@ -83,25 +83,6 @@ public: | |||||||
| // Wilson compressor will add alternate policies for Dirichlet | // Wilson compressor will add alternate policies for Dirichlet | ||||||
| // and possibly partial Dirichlet for DWF | // and possibly partial Dirichlet for DWF | ||||||
| //////////////////////////////////// | //////////////////////////////////// | ||||||
| /* |  | ||||||
| class FaceGatherDirichlet |  | ||||||
| { |  | ||||||
|   // If it's dirichlet we don't assemble comms buffers |  | ||||||
|   // |  | ||||||
|   // Rely on zeroes in gauge field to drive the correct result |  | ||||||
|   // NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute |  | ||||||
|   template<class vobj,class cobj,class compressor> |  | ||||||
|   static void Gather_plane_simple (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so){}; |  | ||||||
|   template<class vobj,class cobj,class compressor> |  | ||||||
|   static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, |  | ||||||
| 				   Vector<cobj *> pointers,int dimension,int plane,int cbmask, |  | ||||||
| 				   compressor &compress,int type) {} |  | ||||||
|   template<class decompressor,class Merger> |  | ||||||
|   static void Merge(decompressor decompress,Merge &mm)  {  } |  | ||||||
|   template<class decompressor,class Decompression> |  | ||||||
|   static void Decompress(decompressor decompress,Decompression &dd) {} |  | ||||||
| }; |  | ||||||
| */ |  | ||||||
|  |  | ||||||
| template<class vobj,class FaceGather> | template<class vobj,class FaceGather> | ||||||
| class SimpleCompressorGather : public FaceGather { | class SimpleCompressorGather : public FaceGather { | ||||||
|   | |||||||
| @@ -31,7 +31,6 @@ | |||||||
| #define STENCIL_MAX (16) | #define STENCIL_MAX (16) | ||||||
|  |  | ||||||
| #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate | #include <Grid/stencil/SimpleCompressor.h>   // subdir aggregate | ||||||
| #include <Grid/stencil/Lebesgue.h>   // subdir aggregate |  | ||||||
| #include <Grid/stencil/GeneralLocalStencil.h> | #include <Grid/stencil/GeneralLocalStencil.h> | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -122,17 +121,22 @@ class CartesianStencilAccelerator { | |||||||
|   StencilVector same_node; |   StencilVector same_node; | ||||||
|   Coordinate    _simd_layout; |   Coordinate    _simd_layout; | ||||||
|   Parameters    parameters; |   Parameters    parameters; | ||||||
|  |   ViewMode mode; | ||||||
|   StencilEntry*  _entries_p; |   StencilEntry*  _entries_p; | ||||||
|  |   StencilEntry*  _entries_host_p; | ||||||
|   cobj* u_recv_buf_p; |   cobj* u_recv_buf_p; | ||||||
|   cobj* u_send_buf_p; |   cobj* u_send_buf_p; | ||||||
|  |  | ||||||
|   accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; } |   accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; } | ||||||
|  |  | ||||||
|   accelerator_inline int GetNodeLocal(int osite,int point) const { |   // Not a device function | ||||||
|     return this->_entries_p[point+this->_npoints*osite]._is_local; |   inline int GetNodeLocal(int osite,int point) const { | ||||||
|  |     StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite]; | ||||||
|  |     return SE._is_local; | ||||||
|   } |   } | ||||||
|   accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const { |   accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const { | ||||||
|     ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; |     ptype = this->_permute_type[point]; | ||||||
|  |     return & this->_entries_p[point+this->_npoints*osite]; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const { |   accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const { | ||||||
| @@ -165,28 +169,22 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame | |||||||
| { | { | ||||||
| public: | public: | ||||||
|   int *closed; |   int *closed; | ||||||
|   StencilEntry *cpu_ptr; |   //  StencilEntry *cpu_ptr; | ||||||
|   ViewMode      mode; |  | ||||||
|  public: |  public: | ||||||
|   // default copy constructor |   // default copy constructor | ||||||
|   CartesianStencilView (const CartesianStencilView &refer_to_me) = default; |   CartesianStencilView (const CartesianStencilView &refer_to_me) = default; | ||||||
|  |  | ||||||
|   CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode) |   CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode) | ||||||
|     : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me), |     : CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me) | ||||||
|     cpu_ptr(this->_entries_p), |  | ||||||
|     mode(_mode) |  | ||||||
|   { |   { | ||||||
|     this->_entries_p =(StencilEntry *) |     this->ViewOpen(_mode); | ||||||
|       MemoryManager::ViewOpen(this->_entries_p, |   } | ||||||
| 			      this->_npoints*this->_osites*sizeof(StencilEntry), |   void ViewOpen(ViewMode _mode) | ||||||
| 			      mode, |   { | ||||||
| 			      AdviseDefault); |     this->mode = _mode; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void ViewClose(void) |   void ViewClose(void)  {  } | ||||||
|   { |  | ||||||
|     MemoryManager::ViewClose(this->cpu_ptr,this->mode); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @@ -256,7 +254,6 @@ protected: | |||||||
|   GridBase *                        _grid; |   GridBase *                        _grid; | ||||||
| public: | public: | ||||||
|   GridBase *Grid(void) const { return _grid; } |   GridBase *Grid(void) const { return _grid; } | ||||||
|   LebesgueOrder *lo; |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Needed to conveniently communicate gparity parameters into GPU memory |   // Needed to conveniently communicate gparity parameters into GPU memory | ||||||
| @@ -273,11 +270,11 @@ public: | |||||||
|   int face_table_computed; |   int face_table_computed; | ||||||
|   int partialDirichlet; |   int partialDirichlet; | ||||||
|   int fullDirichlet; |   int fullDirichlet; | ||||||
|   std::vector<commVector<std::pair<int,int> > > face_table ; |   std::vector<deviceVector<std::pair<int,int> > > face_table ; | ||||||
|   Vector<int> surface_list; |   deviceVector<int> surface_list; | ||||||
|  |  | ||||||
|   stencilVector<StencilEntry>  _entries; // Resident in managed memory |   std::vector<StencilEntry>   _entries; // Resident in host memory | ||||||
|   commVector<StencilEntry>     _entries_device; // Resident in device memory |   deviceVector<StencilEntry>  _entries_device; // Resident in device memory | ||||||
|   std::vector<Packet> Packets; |   std::vector<Packet> Packets; | ||||||
|   std::vector<Merge> Mergers; |   std::vector<Merge> Mergers; | ||||||
|   std::vector<Merge> MergersSHM; |   std::vector<Merge> MergersSHM; | ||||||
| @@ -366,12 +363,32 @@ public: | |||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|  |     //    std::cout << "Communicate Begin "<<std::endl; | ||||||
|  |     //    _grid->Barrier(); | ||||||
|  |     FlightRecorder::StepLog("Communicate begin"); | ||||||
|     // All GPU kernel tasks must complete |     // All GPU kernel tasks must complete | ||||||
|     //    accelerator_barrier();     // All kernels should ALREADY be complete |     //    accelerator_barrier();     // All kernels should ALREADY be complete | ||||||
|     //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer |     //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer | ||||||
|                                // But the HaloGather had a barrier too. |                                // But the HaloGather had a barrier too. | ||||||
| #ifdef ACCELERATOR_AWARE_MPI |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       //      std::cout << "Communicate prepare "<<i<<std::endl; | ||||||
|  |       //      _grid->Barrier(); | ||||||
|  |       _grid->StencilSendToRecvFromPrepare(MpiReqs, | ||||||
|  | 					  Packets[i].send_buf, | ||||||
|  | 					  Packets[i].to_rank,Packets[i].do_send, | ||||||
|  | 					  Packets[i].recv_buf, | ||||||
|  | 					  Packets[i].from_rank,Packets[i].do_recv, | ||||||
|  | 					  Packets[i].xbytes,Packets[i].rbytes,i); | ||||||
|  |     } | ||||||
|  |     //    std::cout << "Communicate PollDtoH "<<std::endl; | ||||||
|  |     //    _grid->Barrier(); | ||||||
|  |     _grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/ | ||||||
|  |     //    std::cout << "Communicate CopySynch "<<std::endl; | ||||||
|  |     //    _grid->Barrier(); | ||||||
|  |     acceleratorCopySynchronise(); | ||||||
|  |     // Starts intranode | ||||||
|  |     for(int i=0;i<Packets.size();i++){ | ||||||
|  |       //      std::cout << "Communicate Begin "<<i<<std::endl; | ||||||
|       _grid->StencilSendToRecvFromBegin(MpiReqs, |       _grid->StencilSendToRecvFromBegin(MpiReqs, | ||||||
| 					Packets[i].send_buf, | 					Packets[i].send_buf, | ||||||
| 					Packets[i].to_rank,Packets[i].do_send, | 					Packets[i].to_rank,Packets[i].do_send, | ||||||
| @@ -379,23 +396,6 @@ public: | |||||||
| 					Packets[i].from_rank,Packets[i].do_recv, | 					Packets[i].from_rank,Packets[i].do_recv, | ||||||
| 					Packets[i].xbytes,Packets[i].rbytes,i); | 					Packets[i].xbytes,Packets[i].rbytes,i); | ||||||
|     } |     } | ||||||
| #else |  | ||||||
| #warning "Using COPY VIA HOST BUFFERS IN STENCIL" |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |  | ||||||
|       // Introduce a host buffer with a cheap slab allocator and zero cost wipe all |  | ||||||
|       Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes); |  | ||||||
|       Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); |  | ||||||
|       if ( Packets[i].do_send ) { |  | ||||||
| 	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); |  | ||||||
|       } |  | ||||||
|       _grid->StencilSendToRecvFromBegin(MpiReqs, |  | ||||||
| 					Packets[i].host_send_buf, |  | ||||||
| 					Packets[i].to_rank,Packets[i].do_send, |  | ||||||
| 					Packets[i].host_recv_buf, |  | ||||||
| 					Packets[i].from_rank,Packets[i].do_recv, |  | ||||||
| 					Packets[i].xbytes,Packets[i].rbytes,i); |  | ||||||
|     } |  | ||||||
| #endif |  | ||||||
|     // Get comms started then run checksums |     // Get comms started then run checksums | ||||||
|     // Having this PRIOR to the dslash seems to make Sunspot work... (!) |     // Having this PRIOR to the dslash seems to make Sunspot work... (!) | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
| @@ -406,27 +406,25 @@ public: | |||||||
|  |  | ||||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|  |     //    std::cout << "Communicate Complete "<<std::endl; | ||||||
|  |     //    _grid->Barrier(); | ||||||
|  |     FlightRecorder::StepLog("Start communicate complete"); | ||||||
|  |     //    std::cout << "Communicate Complete PollIRecv "<<std::endl; | ||||||
|  |     //    _grid->Barrier(); | ||||||
|  |     _grid->StencilSendToRecvFromPollIRecv(MpiReqs); | ||||||
|  |     //    std::cout << "Communicate Complete Complete "<<std::endl; | ||||||
|  |     //    _grid->Barrier(); | ||||||
|     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done |     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done | ||||||
|     if   ( this->partialDirichlet ) DslashLogPartial(); |     if   ( this->partialDirichlet ) DslashLogPartial(); | ||||||
|     else if ( this->fullDirichlet ) DslashLogDirichlet(); |     else if ( this->fullDirichlet ) DslashLogDirichlet(); | ||||||
|     else DslashLogFull(); |     else DslashLogFull(); | ||||||
|     // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete |     //    acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete | ||||||
|     //    accelerator_barrier();  |     //    accelerator_barrier();  | ||||||
|     _grid->StencilBarrier();  |  | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
| #warning "Using COPY VIA HOST BUFFERS IN STENCIL" |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |  | ||||||
|       if ( Packets[i].do_recv ) { |  | ||||||
| 	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     _grid->HostBufferFreeAll(); |  | ||||||
| #endif |  | ||||||
|     // run any checksums |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       if ( Packets[i].do_recv ) |       if ( Packets[i].do_recv ) | ||||||
| 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); | 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); | ||||||
|     } |     } | ||||||
|  |     FlightRecorder::StepLog("Finish communicate complete"); | ||||||
|   } |   } | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Blocking send and receive. Either sequential or parallel. |   // Blocking send and receive. Either sequential or parallel. | ||||||
| @@ -503,6 +501,9 @@ public: | |||||||
|   void HaloGather(const Lattice<vobj> &source,compressor &compress) |   void HaloGather(const Lattice<vobj> &source,compressor &compress) | ||||||
|   { |   { | ||||||
|     //    accelerator_barrier(); |     //    accelerator_barrier(); | ||||||
|  |     ////////////////////////////////// | ||||||
|  |     // I will overwrite my send buffers | ||||||
|  |     ////////////////////////////////// | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single node |     _grid->StencilBarrier();// Synch shared memory on a single node | ||||||
|  |  | ||||||
|     assert(source.Grid()==_grid); |     assert(source.Grid()==_grid); | ||||||
| @@ -516,6 +517,12 @@ public: | |||||||
|       HaloGatherDir(source,compress,point,face_idx); |       HaloGatherDir(source,compress,point,face_idx); | ||||||
|     } |     } | ||||||
|     accelerator_barrier(); // All my local gathers are complete |     accelerator_barrier(); // All my local gathers are complete | ||||||
|  | #ifdef NVLINK_GET | ||||||
|  |     #warning "NVLINK_GET" | ||||||
|  |     _grid->StencilBarrier(); // He can now get my local gather, I can get his | ||||||
|  |     // Synch shared memory on a single node; could use an asynchronous barrier here and defer check | ||||||
|  |     // Or issue barrier AFTER the DMA is running | ||||||
|  | #endif     | ||||||
|     face_table_computed=1; |     face_table_computed=1; | ||||||
|     assert(u_comm_offset==_unified_buffer_size); |     assert(u_comm_offset==_unified_buffer_size); | ||||||
|   } |   } | ||||||
| @@ -554,6 +561,7 @@ public: | |||||||
| 	  coalescedWrite(to[j] ,coalescedRead(from [j])); | 	  coalescedWrite(to[j] ,coalescedRead(from [j])); | ||||||
|       }); |       }); | ||||||
|       acceleratorFenceComputeStream(); |       acceleratorFenceComputeStream(); | ||||||
|  |       // Also fenced in WilsonKernels | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|    |    | ||||||
| @@ -651,10 +659,10 @@ public: | |||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   void PrecomputeByteOffsets(void){ |   void PrecomputeByteOffsets(void){ | ||||||
|     for(int i=0;i<_entries.size();i++){ |     for(int i=0;i<_entries.size();i++){ | ||||||
|       if( _entries[i]._is_local ) { |       if( this->_entries[i]._is_local ) { | ||||||
| 	_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj); | 	this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj); | ||||||
|       } else { |       } else { | ||||||
| 	_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj); | 	this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
| @@ -668,7 +676,7 @@ public: | |||||||
|     for(int point=0;point<this->_npoints;point++){ |     for(int point=0;point<this->_npoints;point++){ | ||||||
|       this->same_node[point] = this->SameNode(point); |       this->same_node[point] = this->SameNode(point); | ||||||
|     } |     } | ||||||
|  |     int32_t surface_list_size=0; | ||||||
|     for(int site = 0 ;site< vol4;site++){ |     for(int site = 0 ;site< vol4;site++){ | ||||||
|       int local = 1; |       int local = 1; | ||||||
|       for(int point=0;point<this->_npoints;point++){ |       for(int point=0;point<this->_npoints;point++){ | ||||||
| @@ -678,11 +686,30 @@ public: | |||||||
|       } |       } | ||||||
|       if(local == 0) { |       if(local == 0) { | ||||||
| 	for(int s=0;s<Ls;s++){ | 	for(int s=0;s<Ls;s++){ | ||||||
| 	  surface_list.push_back(site*Ls+s); | 	  surface_list_size++; | ||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl; |     surface_list.resize(surface_list_size); | ||||||
|  |     std::vector<int> surface_list_host(surface_list_size); | ||||||
|  |     int32_t ss=0; | ||||||
|  |     for(int site = 0 ;site< vol4;site++){ | ||||||
|  |       int local = 1; | ||||||
|  |       for(int point=0;point<this->_npoints;point++){ | ||||||
|  | 	if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ | ||||||
|  | 	  local = 0; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |       if(local == 0) { | ||||||
|  | 	for(int s=0;s<Ls;s++){ | ||||||
|  | 	  int idx=site*Ls+s; | ||||||
|  | 	  surface_list_host[ss]= idx; | ||||||
|  | 	  ss++; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int)); | ||||||
|  |     std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl; | ||||||
|   } |   } | ||||||
|   /// Introduce a block structure and switch off comms on boundaries |   /// Introduce a block structure and switch off comms on boundaries | ||||||
|   void DirichletBlock(const Coordinate &dirichlet_block) |   void DirichletBlock(const Coordinate &dirichlet_block) | ||||||
| @@ -770,7 +797,13 @@ public: | |||||||
|     this->_osites  = _grid->oSites(); |     this->_osites  = _grid->oSites(); | ||||||
|  |  | ||||||
|     _entries.resize(this->_npoints* this->_osites); |     _entries.resize(this->_npoints* this->_osites); | ||||||
|     this->_entries_p = &_entries[0]; |     _entries_device.resize(this->_npoints* this->_osites); | ||||||
|  |     this->_entries_host_p = &_entries[0]; | ||||||
|  |     this->_entries_p = &_entries_device[0]; | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites | ||||||
|  | 	      <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl; | ||||||
|  |      | ||||||
|     for(int ii=0;ii<npoints;ii++){ |     for(int ii=0;ii<npoints;ii++){ | ||||||
|  |  | ||||||
|       int i = ii; // reverse direction to get SIMD comms done first |       int i = ii; // reverse direction to get SIMD comms done first | ||||||
| @@ -847,6 +880,7 @@ public: | |||||||
|       u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); |       u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj)); | ||||||
|     } |     } | ||||||
|     PrecomputeByteOffsets(); |     PrecomputeByteOffsets(); | ||||||
|  |     acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry)); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void Local     (int point, int dimension,int shiftpm,int cbmask) |   void Local     (int point, int dimension,int shiftpm,int cbmask) | ||||||
| @@ -1002,10 +1036,10 @@ public: | |||||||
|       for(int n=0;n<_grid->_slice_nblock[dimension];n++){ |       for(int n=0;n<_grid->_slice_nblock[dimension];n++){ | ||||||
| 	for(int b=0;b<_grid->_slice_block[dimension];b++){ | 	for(int b=0;b<_grid->_slice_block[dimension];b++){ | ||||||
| 	  int idx=point+(lo+o+b)*this->_npoints; | 	  int idx=point+(lo+o+b)*this->_npoints; | ||||||
| 	  _entries[idx]._offset  =ro+o+b; | 	  this->_entries[idx]._offset  =ro+o+b; | ||||||
| 	  _entries[idx]._permute=permute; | 	  this->_entries[idx]._permute=permute; | ||||||
| 	  _entries[idx]._is_local=1; | 	  this->_entries[idx]._is_local=1; | ||||||
| 	  _entries[idx]._around_the_world=wrap; | 	  this->_entries[idx]._around_the_world=wrap; | ||||||
| 	} | 	} | ||||||
| 	o +=_grid->_slice_stride[dimension]; | 	o +=_grid->_slice_stride[dimension]; | ||||||
|       } |       } | ||||||
| @@ -1023,10 +1057,10 @@ public: | |||||||
|  |  | ||||||
| 	  if ( ocb&cbmask ) { | 	  if ( ocb&cbmask ) { | ||||||
| 	    int idx = point+(lo+o+b)*this->_npoints; | 	    int idx = point+(lo+o+b)*this->_npoints; | ||||||
| 	    _entries[idx]._offset =ro+o+b; | 	    this->_entries[idx]._offset =ro+o+b; | ||||||
| 	    _entries[idx]._is_local=1; | 	    this->_entries[idx]._is_local=1; | ||||||
| 	    _entries[idx]._permute=permute; | 	    this->_entries[idx]._permute=permute; | ||||||
| 	    _entries[idx]._around_the_world=wrap; | 	    this->_entries[idx]._around_the_world=wrap; | ||||||
| 	  } | 	  } | ||||||
|  |  | ||||||
| 	} | 	} | ||||||
| @@ -1050,10 +1084,10 @@ public: | |||||||
|       for(int n=0;n<_grid->_slice_nblock[dimension];n++){ |       for(int n=0;n<_grid->_slice_nblock[dimension];n++){ | ||||||
| 	for(int b=0;b<_grid->_slice_block[dimension];b++){ | 	for(int b=0;b<_grid->_slice_block[dimension];b++){ | ||||||
| 	  int idx=point+(so+o+b)*this->_npoints; | 	  int idx=point+(so+o+b)*this->_npoints; | ||||||
| 	  _entries[idx]._offset  =offset+(bo++); | 	  this->_entries[idx]._offset  =offset+(bo++); | ||||||
| 	  _entries[idx]._is_local=0; | 	  this->_entries[idx]._is_local=0; | ||||||
| 	  _entries[idx]._permute=0; | 	  this->_entries[idx]._permute=0; | ||||||
| 	  _entries[idx]._around_the_world=wrap; | 	  this->_entries[idx]._around_the_world=wrap; | ||||||
| 	} | 	} | ||||||
| 	o +=_grid->_slice_stride[dimension]; | 	o +=_grid->_slice_stride[dimension]; | ||||||
|       } |       } | ||||||
| @@ -1070,10 +1104,10 @@ public: | |||||||
| 	  int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | 	  int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup | ||||||
| 	  if ( ocb & cbmask ) { | 	  if ( ocb & cbmask ) { | ||||||
| 	    int idx = point+(so+o+b)*this->_npoints; | 	    int idx = point+(so+o+b)*this->_npoints; | ||||||
| 	    _entries[idx]._offset  =offset+(bo++); | 	    this->_entries[idx]._offset  =offset+(bo++); | ||||||
| 	    _entries[idx]._is_local=0; | 	    this->_entries[idx]._is_local=0; | ||||||
| 	    _entries[idx]._permute =0; | 	    this->_entries[idx]._permute =0; | ||||||
| 	    _entries[idx]._around_the_world=wrap; | 	    this->_entries[idx]._around_the_world=wrap; | ||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
| 	o +=_grid->_slice_stride[dimension]; | 	o +=_grid->_slice_stride[dimension]; | ||||||
|   | |||||||
| @@ -202,15 +202,15 @@ void acceleratorInit(void) | |||||||
|  |  | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|  |  | ||||||
| cl::sycl::queue *theGridAccelerator; | sycl::queue *theGridAccelerator; | ||||||
| cl::sycl::queue *theCopyAccelerator; | sycl::queue *theCopyAccelerator; | ||||||
| void acceleratorInit(void) | void acceleratorInit(void) | ||||||
| { | { | ||||||
|   int nDevices = 1; |   int nDevices = 1; | ||||||
|   cl::sycl::gpu_selector selector; |   //  sycl::gpu_selector selector; | ||||||
|   cl::sycl::device selectedDevice { selector }; |   //  sycl::device selectedDevice { selector }; | ||||||
|   theGridAccelerator = new sycl::queue (selectedDevice); |   theGridAccelerator = new sycl::queue (sycl::gpu_selector_v); | ||||||
|   theCopyAccelerator = new sycl::queue (selectedDevice); |   theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v); | ||||||
|   //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway. |   //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway. | ||||||
|  |  | ||||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||||
| @@ -242,14 +242,14 @@ void acceleratorInit(void) | |||||||
|   gethostname(hostname, HOST_NAME_MAX+1); |   gethostname(hostname, HOST_NAME_MAX+1); | ||||||
|   if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); |   if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); | ||||||
|  |  | ||||||
|   auto devices = cl::sycl::device::get_devices(); |   auto devices = sycl::device::get_devices(); | ||||||
|   for(int d = 0;d<devices.size();d++){ |   for(int d = 0;d<devices.size();d++){ | ||||||
|  |  | ||||||
| #define GPU_PROP_STR(prop) \ | #define GPU_PROP_STR(prop) \ | ||||||
|     printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str()); |     printf("AcceleratorSyclInit:   " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str()); | ||||||
|  |  | ||||||
| #define GPU_PROP_FMT(prop,FMT) \ | #define GPU_PROP_FMT(prop,FMT) \ | ||||||
|     printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>()); |     printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>()); | ||||||
|  |  | ||||||
| #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld"); | #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld"); | ||||||
|     if ( world_rank == 0) { |     if ( world_rank == 0) { | ||||||
|   | |||||||
| @@ -132,27 +132,17 @@ inline void cuda_mem(void) | |||||||
|  |  | ||||||
| #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ | #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ | ||||||
|   {									\ |   {									\ | ||||||
|     int nt=acceleratorThreads();					\ |     if ( num1*num2 ) {							\ | ||||||
|     typedef uint64_t Iterator;						\ |       int nt=acceleratorThreads();					\ | ||||||
|     auto lambda = [=] accelerator					\ |       typedef uint64_t Iterator;					\ | ||||||
|       (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\ |       auto lambda = [=] accelerator					\ | ||||||
|       __VA_ARGS__;							\ | 	(Iterator iter1,Iterator iter2,Iterator lane) mutable {		\ | ||||||
|     };									\ | 		      __VA_ARGS__;					\ | ||||||
|     dim3 cu_threads(nsimd,acceleratorThreads(),1);			\ | 		    };							\ | ||||||
|     dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\ |       dim3 cu_threads(nsimd,acceleratorThreads(),1);			\ | ||||||
|     LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\ |       dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\ | ||||||
|   } |       LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \ | ||||||
| #define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ |     }									\ | ||||||
|   {									\ |  | ||||||
|     int nt=acceleratorThreads();					\ |  | ||||||
|     typedef uint64_t Iterator;						\ |  | ||||||
|     auto lambda = [=] accelerator					\ |  | ||||||
|       (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\ |  | ||||||
|       __VA_ARGS__;							\ |  | ||||||
|     };									\ |  | ||||||
|     dim3 cu_threads(nsimd,acceleratorThreads(),1);			\ |  | ||||||
|     dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\ |  | ||||||
|     ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \ |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #define accelerator_for6dNB(iter1, num1,				\ | #define accelerator_for6dNB(iter1, num1,				\ | ||||||
| @@ -175,19 +165,6 @@ inline void cuda_mem(void) | |||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
| #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ |  | ||||||
|   {									\ |  | ||||||
|     int nt=acceleratorThreads();					\ |  | ||||||
|     typedef uint64_t Iterator;						\ |  | ||||||
|     auto lambda = [=] accelerator					\ |  | ||||||
|       (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\ |  | ||||||
|       __VA_ARGS__;							\ |  | ||||||
|     };									\ |  | ||||||
|     dim3 cu_threads(nsimd,acceleratorThreads(),1);			\ |  | ||||||
|     dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\ |  | ||||||
|     LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda);	\ |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| template<typename lambda>  __global__ | template<typename lambda>  __global__ | ||||||
| void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) | void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) | ||||||
| { | { | ||||||
| @@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) | |||||||
|     Lambda(x,y,z); |     Lambda(x,y,z); | ||||||
|   } |   } | ||||||
| } | } | ||||||
| template<typename lambda>  __global__ |  | ||||||
| void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) |  | ||||||
| { |  | ||||||
|   // Weird permute is to make lane coalesce for large blocks |  | ||||||
|   uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; |  | ||||||
|   uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; |  | ||||||
|   uint64_t z = threadIdx.x; |  | ||||||
|   if ( (x < num1) && (y<num2) && (z<num3) ) { |  | ||||||
|     Lambda(x,y,z); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<typename lambda>  __global__ | template<typename lambda>  __global__ | ||||||
| void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, | void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, | ||||||
| @@ -243,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, | |||||||
|     }									\ |     }									\ | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  | inline void *acceleratorAllocHost(size_t bytes) | ||||||
|  | { | ||||||
|  |   void *ptr=NULL; | ||||||
|  |   auto err = cudaMallocHost((void **)&ptr,bytes); | ||||||
|  |   if( err != cudaSuccess ) { | ||||||
|  |     ptr = (void *) NULL; | ||||||
|  |     printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err)); | ||||||
|  |     assert(0); | ||||||
|  |   } | ||||||
|  |   return ptr; | ||||||
|  | } | ||||||
| inline void *acceleratorAllocShared(size_t bytes) | inline void *acceleratorAllocShared(size_t bytes) | ||||||
| { | { | ||||||
|   void *ptr=NULL; |   void *ptr=NULL; | ||||||
| @@ -264,8 +241,10 @@ inline void *acceleratorAllocDevice(size_t bytes) | |||||||
|   } |   } | ||||||
|   return ptr; |   return ptr; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; | inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; | ||||||
| inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; | inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; | ||||||
|  | inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);}; | ||||||
| inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} | inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} | ||||||
| inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} | inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} | ||||||
| inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} | inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} | ||||||
| @@ -302,7 +281,7 @@ NAMESPACE_END(Grid); | |||||||
|  |  | ||||||
| // Force deterministic reductions | // Force deterministic reductions | ||||||
| #define SYCL_REDUCTION_DETERMINISTIC | #define SYCL_REDUCTION_DETERMINISTIC | ||||||
| #include <sycl/CL/sycl.hpp> | #include <sycl/sycl.hpp> | ||||||
| #include <sycl/usm.hpp> | #include <sycl/usm.hpp> | ||||||
| #include <level_zero/ze_api.h> | #include <level_zero/ze_api.h> | ||||||
| #include <sycl/ext/oneapi/backend/level_zero.hpp> | #include <sycl/ext/oneapi/backend/level_zero.hpp> | ||||||
| @@ -314,8 +293,8 @@ inline void acceleratorMem(void) | |||||||
|   std::cout <<" SYCL acceleratorMem not implemented"<<std::endl; |   std::cout <<" SYCL acceleratorMem not implemented"<<std::endl; | ||||||
| } | } | ||||||
|  |  | ||||||
| extern cl::sycl::queue *theGridAccelerator; | extern sycl::queue *theGridAccelerator; | ||||||
| extern cl::sycl::queue *theCopyAccelerator; | extern sycl::queue *theCopyAccelerator; | ||||||
|  |  | ||||||
| #ifdef __SYCL_DEVICE_ONLY__ | #ifdef __SYCL_DEVICE_ONLY__ | ||||||
| #define GRID_SIMT | #define GRID_SIMT | ||||||
| @@ -326,24 +305,24 @@ extern cl::sycl::queue *theCopyAccelerator; | |||||||
|  |  | ||||||
| accelerator_inline int acceleratorSIMTlane(int Nsimd) { | accelerator_inline int acceleratorSIMTlane(int Nsimd) { | ||||||
| #ifdef GRID_SIMT | #ifdef GRID_SIMT | ||||||
|  return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];  |  return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2];  | ||||||
| #else | #else | ||||||
|  return 0; |  return 0; | ||||||
| #endif | #endif | ||||||
| } // SYCL specific | } // SYCL specific | ||||||
|  |  | ||||||
| #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ | #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\ |   theGridAccelerator->submit([&](sycl::handler &cgh) {		\ | ||||||
|     unsigned long nt=acceleratorThreads();				\ |     unsigned long nt=acceleratorThreads();				\ | ||||||
|     if(nt < 8)nt=8;							\ |     if(nt < 8)nt=8;							\ | ||||||
|     unsigned long unum1 = num1;						\ |     unsigned long unum1 = num1;						\ | ||||||
|     unsigned long unum2 = num2;						\ |     unsigned long unum2 = num2;						\ | ||||||
|     unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\ |     unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\ | ||||||
|     cl::sycl::range<3> local {nt,1,nsimd};				\ |     sycl::range<3> local {nt,1,nsimd};				\ | ||||||
|     cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\ |     sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\ | ||||||
|     cgh.parallel_for(							\ |     cgh.parallel_for(							\ | ||||||
| 		     cl::sycl::nd_range<3>(global,local),		\ | 		     sycl::nd_range<3>(global,local),			\ | ||||||
| 		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\ | 		     [=] (sycl::nd_item<3> item) /*mutable*/		\ | ||||||
| 		     [[intel::reqd_sub_group_size(16)]]			\ | 		     [[intel::reqd_sub_group_size(16)]]			\ | ||||||
| 		     {							\ | 		     {							\ | ||||||
| 		       auto iter1    = item.get_global_id(0);		\ | 		       auto iter1    = item.get_global_id(0);		\ | ||||||
| @@ -356,12 +335,34 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { | |||||||
| #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } | #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } | ||||||
|  |  | ||||||
| inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; | inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; | ||||||
|  | inline void *acceleratorAllocHost(size_t bytes)  { return malloc_host(bytes,*theGridAccelerator);}; | ||||||
| inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; | inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; | ||||||
|  | inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);}; | ||||||
| inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; | inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; | ||||||
| inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; | inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; | ||||||
|  |  | ||||||
| inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); } | inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); } | ||||||
| inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);} |  | ||||||
|  |  | ||||||
|  | /////// | ||||||
|  | // Asynch event interface | ||||||
|  | /////// | ||||||
|  | typedef sycl::event acceleratorEvent_t; | ||||||
|  |  | ||||||
|  | inline void acceleratorEventWait(acceleratorEvent_t ev) | ||||||
|  | { | ||||||
|  |   ev.wait(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | inline int acceleratorEventIsComplete(acceleratorEvent_t ev) | ||||||
|  | { | ||||||
|  |   return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { return theCopyAccelerator->memcpy(to,from,bytes);} | ||||||
|  | inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)        { return theCopyAccelerator->memcpy(to,from,bytes); } | ||||||
|  | inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes)      { return theCopyAccelerator->memcpy(to,from,bytes); } | ||||||
|  |  | ||||||
| inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} | inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} | ||||||
| inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} | inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} | ||||||
| inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} | inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} | ||||||
| @@ -369,13 +370,15 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele | |||||||
| inline int  acceleratorIsCommunicable(void *ptr) | inline int  acceleratorIsCommunicable(void *ptr) | ||||||
| { | { | ||||||
| #if 0 | #if 0 | ||||||
|   auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); |   auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context()); | ||||||
|   if ( uvm = cl::sycl::usm::alloc::shared ) return 1; |   if ( uvm = sycl::usm::alloc::shared ) return 1; | ||||||
|   else return 0; |   else return 0; | ||||||
| #endif | #endif | ||||||
|   return 1; |   return 1; | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
| @@ -472,6 +475,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) | |||||||
|     }								\ |     }								\ | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  | inline void *acceleratorAllocHost(size_t bytes) | ||||||
|  | { | ||||||
|  |   void *ptr=NULL; | ||||||
|  |   auto err = hipMallocHost((void **)&ptr,bytes); | ||||||
|  |   if( err != hipSuccess ) { | ||||||
|  |     ptr = (void *) NULL; | ||||||
|  |     fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr); | ||||||
|  |   } | ||||||
|  |   return ptr; | ||||||
|  | }; | ||||||
| inline void *acceleratorAllocShared(size_t bytes) | inline void *acceleratorAllocShared(size_t bytes) | ||||||
| { | { | ||||||
|   void *ptr=NULL; |   void *ptr=NULL; | ||||||
| @@ -495,12 +508,12 @@ inline void *acceleratorAllocDevice(size_t bytes) | |||||||
|   return ptr; |   return ptr; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);}; | ||||||
| inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; | inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);}; | ||||||
| inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; | inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);}; | ||||||
| inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} | inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} | ||||||
| inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} | inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} | ||||||
| //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} |  | ||||||
| //inline void acceleratorCopySynchronise(void) {  } |  | ||||||
| inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} | inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);} | ||||||
|  |  | ||||||
| inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch | inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch | ||||||
| @@ -517,15 +530,19 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize | |||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | inline void acceleratorPin(void *ptr,unsigned long bytes) | ||||||
|  | { | ||||||
|  | #ifdef GRID_SYCL | ||||||
|  |   sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context()); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
| // Common on all GPU targets | // Common on all GPU targets | ||||||
| ////////////////////////////////////////////// | ////////////////////////////////////////////// | ||||||
| #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP) | #if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP) | ||||||
| // FIXME -- the non-blocking nature got broken March 30 2023 by PAB | // FIXME -- the non-blocking nature got broken March 30 2023 by PAB | ||||||
| #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );   | #define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );   | ||||||
| #define prof_accelerator_for( iter1, num1, nsimd, ... ) \ |  | ||||||
|   prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\ |  | ||||||
|   accelerator_barrier(dummy); |  | ||||||
|  |  | ||||||
| #define accelerator_for( iter, num, nsimd, ... )		\ | #define accelerator_for( iter, num, nsimd, ... )		\ | ||||||
|   accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\ |   accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\ | ||||||
| @@ -574,8 +591,10 @@ inline void acceleratorCopySynchronise(void) {}; | |||||||
| inline int  acceleratorIsCommunicable(void *ptr){ return 1; } | inline int  acceleratorIsCommunicable(void *ptr){ return 1; } | ||||||
| inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} | inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
|  | inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; | ||||||
| inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; | inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; | ||||||
| inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; | inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; | ||||||
|  | inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);}; | ||||||
| inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);}; | inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);}; | ||||||
| inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);}; | inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);}; | ||||||
| #else | #else | ||||||
|   | |||||||
| @@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail; | |||||||
| int FlightRecorder::LoggingMode; | int FlightRecorder::LoggingMode; | ||||||
| int FlightRecorder::ChecksumComms; | int FlightRecorder::ChecksumComms; | ||||||
| int FlightRecorder::ChecksumCommsSend; | int FlightRecorder::ChecksumCommsSend; | ||||||
|  | const char *   FlightRecorder::StepName; | ||||||
|  | int32_t  FlightRecorder::StepLoggingCounter; | ||||||
| int32_t  FlightRecorder::XmitLoggingCounter; | int32_t  FlightRecorder::XmitLoggingCounter; | ||||||
| int32_t  FlightRecorder::RecvLoggingCounter; | int32_t  FlightRecorder::RecvLoggingCounter; | ||||||
| int32_t  FlightRecorder::CsumLoggingCounter; | int32_t  FlightRecorder::CsumLoggingCounter; | ||||||
| @@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void) | |||||||
|   CsumLoggingCounter=0; |   CsumLoggingCounter=0; | ||||||
|   NormLoggingCounter=0; |   NormLoggingCounter=0; | ||||||
|   ReductionLoggingCounter=0; |   ReductionLoggingCounter=0; | ||||||
|  |   StepName = "No steps started"; | ||||||
|  |   StepLoggingCounter=0; | ||||||
| } | } | ||||||
| void FlightRecorder::Truncate(void) | void FlightRecorder::Truncate(void) | ||||||
| { | { | ||||||
| @@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode) | |||||||
|     assert(0); |     assert(0); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  | bool FlightRecorder::StepLog(const char *name) | ||||||
|  | { | ||||||
|  |   StepName = name; | ||||||
|  |   StepLoggingCounter ++; | ||||||
|  |   return true; | ||||||
|  | } | ||||||
|  |  | ||||||
| void FlightRecorder::SetLoggingModePrint(void) | void FlightRecorder::SetLoggingModePrint(void) | ||||||
| { | { | ||||||
| @@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void) | |||||||
| { | { | ||||||
|   return ErrorCounter; |   return ErrorCounter; | ||||||
| } | } | ||||||
| void FlightRecorder::NormLog(double value) | bool FlightRecorder::NormLog(double value) | ||||||
| { | { | ||||||
|   uint64_t hex = * ( (uint64_t *)&value ); |   uint64_t hex = * ( (uint64_t *)&value ); | ||||||
|   if(LoggingMode == LoggingModePrint) { |   if(LoggingMode == LoggingModePrint) { | ||||||
|     std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; |     std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|     NormLoggingCounter++; |     NormLoggingCounter++; | ||||||
|  |     return true; | ||||||
|   } |   } | ||||||
|   if(LoggingMode == LoggingModeRecord) { |   if(LoggingMode == LoggingModeRecord) { | ||||||
|     std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; |     std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|     NormLogVector.push_back(value); |     NormLogVector.push_back(value); | ||||||
|     NormLoggingCounter++; |     NormLoggingCounter++; | ||||||
|  |     return true; | ||||||
|   } |   } | ||||||
|   if(LoggingMode == LoggingModeVerify) { |   if(LoggingMode == LoggingModeVerify) { | ||||||
|  |  | ||||||
| @@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value) | |||||||
|  |  | ||||||
|       if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) { |       if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) { | ||||||
|  |  | ||||||
|  | 	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n", | ||||||
|  | 		FlightRecorder::StepLoggingCounter, | ||||||
|  | 		FlightRecorder::StepName); | ||||||
| 	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter | 	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter | ||||||
| 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" " | 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" " | ||||||
| 		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl; | 		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl; | ||||||
| @@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value) | |||||||
| 		NormLoggingCounter,NormLogVector.size(), | 		NormLoggingCounter,NormLogVector.size(), | ||||||
| 		value, NormLogVector[NormLoggingCounter]); fflush(stderr); | 		value, NormLogVector[NormLoggingCounter]); fflush(stderr); | ||||||
|  |  | ||||||
| 	if(!ContinueOnFail)assert(0); // Force takedown of job | 	BACKTRACEFP(stderr); | ||||||
|  |  | ||||||
|  | 	if(!ContinueOnFail) return false; | ||||||
| 	   | 	   | ||||||
| 	ErrorCounter++; | 	ErrorCounter++; | ||||||
|       } else { |       } else { | ||||||
| @@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value) | |||||||
|     } |     } | ||||||
|     NormLoggingCounter++; |     NormLoggingCounter++; | ||||||
|   } |   } | ||||||
|  |   return true; | ||||||
| } | } | ||||||
| void FlightRecorder::CsumLog(uint64_t hex) | bool FlightRecorder::CsumLog(uint64_t hex) | ||||||
| { | { | ||||||
|   if(LoggingMode == LoggingModePrint) { |   if(LoggingMode == LoggingModePrint) { | ||||||
|     std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; |     std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|     CsumLoggingCounter++; |     CsumLoggingCounter++; | ||||||
|  |     return true; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if(LoggingMode == LoggingModeRecord) { |   if(LoggingMode == LoggingModeRecord) { | ||||||
|     std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; |     std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; | ||||||
|     CsumLogVector.push_back(hex); |     CsumLogVector.push_back(hex); | ||||||
|     CsumLoggingCounter++; |     CsumLoggingCounter++; | ||||||
|  |     return true; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if(LoggingMode == LoggingModeVerify) { |   if(LoggingMode == LoggingModeVerify) { | ||||||
| @@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex) | |||||||
|  |  | ||||||
|       if ( hex != hexref ) { |       if ( hex != hexref ) { | ||||||
|  |  | ||||||
|  | 	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n", | ||||||
|  | 		FlightRecorder::StepLoggingCounter, | ||||||
|  | 		FlightRecorder::StepName); | ||||||
|         std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter |         std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter | ||||||
| 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl; | 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl; | ||||||
|  |  | ||||||
| @@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex) | |||||||
| 		GridHostname(), | 		GridHostname(), | ||||||
| 		GlobalSharedMemory::WorldShmRank, | 		GlobalSharedMemory::WorldShmRank, | ||||||
| 		CsumLoggingCounter,hex, hexref); | 		CsumLoggingCounter,hex, hexref); | ||||||
|  | 	BACKTRACEFP(stderr); | ||||||
| 	fflush(stderr); | 	fflush(stderr); | ||||||
|  |  | ||||||
| 	if(!ContinueOnFail) assert(0); // Force takedown of job | 	if(!ContinueOnFail) return false; | ||||||
| 	   | 	   | ||||||
| 	ErrorCounter++; | 	ErrorCounter++; | ||||||
|  |  | ||||||
| @@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex) | |||||||
|     } |     } | ||||||
|     CsumLoggingCounter++; |     CsumLoggingCounter++; | ||||||
|   } |   } | ||||||
|  |   return true; | ||||||
| } | } | ||||||
|  |  | ||||||
| void FlightRecorder::ReductionLog(double local,double global) | void FlightRecorder::ReductionLog(double local,double global) | ||||||
| { | { | ||||||
|   uint64_t hex_l = * ( (uint64_t *)&local ); |   uint64_t hex_l = * ( (uint64_t *)&local ); | ||||||
| @@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global) | |||||||
|   if(LoggingMode == LoggingModeVerify) { |   if(LoggingMode == LoggingModeVerify) { | ||||||
|     if(ReductionLoggingCounter < ReductionLogVector.size()){ |     if(ReductionLoggingCounter < ReductionLogVector.size()){ | ||||||
|       if ( global != ReductionLogVector[ReductionLoggingCounter] ) { |       if ( global != ReductionLogVector[ReductionLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n", | ||||||
|  | 		FlightRecorder::StepLoggingCounter, | ||||||
|  | 		FlightRecorder::StepName); | ||||||
| 	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n", | 	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n", | ||||||
| 		GridHostname(), | 		GridHostname(), | ||||||
| 		GlobalSharedMemory::WorldShmRank, | 		GlobalSharedMemory::WorldShmRank, | ||||||
| 		ReductionLoggingCounter,ReductionLogVector.size(), | 		ReductionLoggingCounter,ReductionLogVector.size(), | ||||||
| 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr); | 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr); | ||||||
|  | 	BACKTRACEFP(stderr); | ||||||
| 	 | 	 | ||||||
| 	if ( !ContinueOnFail ) assert(0); | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
| @@ -250,10 +280,11 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) | |||||||
|   if(LoggingMode == LoggingModeNone) return; |   if(LoggingMode == LoggingModeNone) return; | ||||||
|  |  | ||||||
|   if ( ChecksumCommsSend ){ |   if ( ChecksumCommsSend ){ | ||||||
|   uint64_t *ubuf = (uint64_t *)buf; |  | ||||||
|   if(LoggingMode == LoggingModeNone) return; |     if(LoggingMode == LoggingModeNone) return; | ||||||
|    |    | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|  |   uint64_t *ubuf = (uint64_t *)buf; | ||||||
|   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); |   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); | ||||||
|   if(LoggingMode == LoggingModePrint) { |   if(LoggingMode == LoggingModePrint) { | ||||||
|     std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; |     std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
| @@ -267,11 +298,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) | |||||||
|   if(LoggingMode == LoggingModeVerify) { |   if(LoggingMode == LoggingModeVerify) { | ||||||
|     if(XmitLoggingCounter < XmitLogVector.size()){ |     if(XmitLoggingCounter < XmitLogVector.size()){ | ||||||
|       if ( _xor != XmitLogVector[XmitLoggingCounter] ) { |       if ( _xor != XmitLogVector[XmitLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n", | ||||||
|  | 		FlightRecorder::StepLoggingCounter, | ||||||
|  | 		FlightRecorder::StepName); | ||||||
| 	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n", | 	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n", | ||||||
| 		GridHostname(), | 		GridHostname(), | ||||||
| 		GlobalSharedMemory::WorldShmRank, | 		GlobalSharedMemory::WorldShmRank, | ||||||
| 		XmitLoggingCounter,XmitLogVector.size(), | 		XmitLoggingCounter,XmitLogVector.size(), | ||||||
| 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr); | 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr); | ||||||
|  | 	BACKTRACEFP(stderr); | ||||||
| 	 | 	 | ||||||
| 	if ( !ContinueOnFail ) assert(0); | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
| @@ -293,9 +328,9 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes) | |||||||
| void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) | void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) | ||||||
| { | { | ||||||
|   if ( ChecksumComms ){ |   if ( ChecksumComms ){ | ||||||
|   uint64_t *ubuf = (uint64_t *)buf; |  | ||||||
|   if(LoggingMode == LoggingModeNone) return; |   if(LoggingMode == LoggingModeNone) return; | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
|  |   uint64_t *ubuf = (uint64_t *)buf; | ||||||
|   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); |   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); | ||||||
|   if(LoggingMode == LoggingModePrint) { |   if(LoggingMode == LoggingModePrint) { | ||||||
|     std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; |     std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; | ||||||
| @@ -309,11 +344,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) | |||||||
|   if(LoggingMode == LoggingModeVerify) { |   if(LoggingMode == LoggingModeVerify) { | ||||||
|     if(RecvLoggingCounter < RecvLogVector.size()){ |     if(RecvLoggingCounter < RecvLogVector.size()){ | ||||||
|       if ( _xor != RecvLogVector[RecvLoggingCounter] ) { |       if ( _xor != RecvLogVector[RecvLoggingCounter] ) { | ||||||
|  | 	fprintf(stderr,"FlightRecorder Oops step %d stage %s \n", | ||||||
|  | 		FlightRecorder::StepLoggingCounter, | ||||||
|  | 		FlightRecorder::StepName); | ||||||
| 	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n", | 	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n", | ||||||
| 		GridHostname(), | 		GridHostname(), | ||||||
| 		GlobalSharedMemory::WorldShmRank, | 		GlobalSharedMemory::WorldShmRank, | ||||||
| 		RecvLoggingCounter,RecvLogVector.size(), | 		RecvLoggingCounter,RecvLogVector.size(), | ||||||
| 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr); | 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr); | ||||||
|  | 	BACKTRACEFP(stderr); | ||||||
| 	 | 	 | ||||||
| 	if ( !ContinueOnFail ) assert(0); | 	if ( !ContinueOnFail ) assert(0); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -12,6 +12,8 @@ class FlightRecorder { | |||||||
|    |    | ||||||
|   static int                   LoggingMode; |   static int                   LoggingMode; | ||||||
|   static uint64_t              ErrorCounter; |   static uint64_t              ErrorCounter; | ||||||
|  |   static const char *                StepName; | ||||||
|  |   static int32_t               StepLoggingCounter; | ||||||
|   static int32_t               XmitLoggingCounter; |   static int32_t               XmitLoggingCounter; | ||||||
|   static int32_t               RecvLoggingCounter; |   static int32_t               RecvLoggingCounter; | ||||||
|   static int32_t               CsumLoggingCounter; |   static int32_t               CsumLoggingCounter; | ||||||
| @@ -30,8 +32,9 @@ class FlightRecorder { | |||||||
|   static void SetLoggingModeRecord(void); |   static void SetLoggingModeRecord(void); | ||||||
|   static void SetLoggingModeVerify(void); |   static void SetLoggingModeVerify(void); | ||||||
|   static void SetLoggingMode(LoggingMode_t mode); |   static void SetLoggingMode(LoggingMode_t mode); | ||||||
|   static void NormLog(double value); |   static bool StepLog(const char *name); | ||||||
|   static void CsumLog(uint64_t csum); |   static bool NormLog(double value); | ||||||
|  |   static bool CsumLog(uint64_t csum); | ||||||
|   static void ReductionLog(double lcl, double glbl); |   static void ReductionLog(double lcl, double glbl); | ||||||
|   static void Truncate(void); |   static void Truncate(void); | ||||||
|   static void ResetCounters(void); |   static void ResetCounters(void); | ||||||
|   | |||||||
| @@ -464,16 +464,12 @@ void Grid_init(int *argc,char ***argv) | |||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"Performance:"<<std::endl; |     std::cout<<GridLogMessage<<"Performance:"<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;     |  | ||||||
|     std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;     |  | ||||||
|     std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;     |     std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;     |     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;     |     std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<"  --dslash-asm    : Wilson kernel for AVX512"<<std::endl;     |     std::cout<<GridLogMessage<<"  --dslash-asm    : Wilson kernel for AVX512"<<std::endl;     | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     std::cout<<GridLogMessage<<"  --lebesgue      : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;     |  | ||||||
|     std::cout<<GridLogMessage<<"  --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;     |  | ||||||
|     std::cout<<GridLogMessage<<std::endl; |     std::cout<<GridLogMessage<<std::endl; | ||||||
|     exit(EXIT_SUCCESS); |     exit(EXIT_SUCCESS); | ||||||
|   } |   } | ||||||
| @@ -501,28 +497,8 @@ void Grid_init(int *argc,char ***argv) | |||||||
|     WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute; |     WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute; | ||||||
|     StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute; |     StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute; | ||||||
|   } |   } | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){ |  | ||||||
|     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent); |  | ||||||
|   } |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){ |  | ||||||
|     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ |  | ||||||
|     LebesgueOrder::UseLebesgueOrder=1; |  | ||||||
|   } |  | ||||||
|   CartesianCommunicator::nCommThreads = 1; |   CartesianCommunicator::nCommThreads = 1; | ||||||
| #ifdef GRID_COMMS_THREADS   |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ |  | ||||||
|     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); |  | ||||||
|     GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); |  | ||||||
|     assert(CartesianCommunicator::nCommThreads > 0); |  | ||||||
|   } |  | ||||||
| #endif   |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ |  | ||||||
|     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); |  | ||||||
|     GridCmdOptionIntVector(arg,LebesgueOrder::Block); |  | ||||||
|   } |  | ||||||
|   if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){ |   if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){ | ||||||
|     GridLogTimestamp(0); |     GridLogTimestamp(0); | ||||||
|   } else { |   } else { | ||||||
| @@ -573,8 +549,34 @@ void GridLogLayout() { | |||||||
|  |  | ||||||
| void * Grid_backtrace_buffer[_NBACKTRACE]; | void * Grid_backtrace_buffer[_NBACKTRACE]; | ||||||
|  |  | ||||||
|  | void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr) | ||||||
|  | { | ||||||
|  |   fprintf(stderr,"Signal handler on host %s\n",hostname); | ||||||
|  |   fprintf(stderr,"FlightRecorder step %d stage %s \n", | ||||||
|  | 	  FlightRecorder::StepLoggingCounter, | ||||||
|  | 	  FlightRecorder::StepName); | ||||||
|  |   fprintf(stderr,"Caught signal %d\n",si->si_signo); | ||||||
|  |   fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr); | ||||||
|  |   fprintf(stderr,"         code %d\n",si->si_code); | ||||||
|  |   // x86 64bit | ||||||
|  | #ifdef __linux__ | ||||||
|  | #ifdef __x86_64__ | ||||||
|  |   ucontext_t * uc= (ucontext_t *)ptr; | ||||||
|  |   struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; | ||||||
|  |   fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip); | ||||||
|  | #endif | ||||||
|  | #endif | ||||||
|  |   fflush(stderr); | ||||||
|  |   BACKTRACEFP(stderr); | ||||||
|  |   fprintf(stderr,"Called backtrace\n"); | ||||||
|  |   fflush(stdout); | ||||||
|  |   fflush(stderr); | ||||||
|  |   return; | ||||||
|  | } | ||||||
|  |  | ||||||
| void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) | void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) | ||||||
| { | { | ||||||
|  |   fprintf(stderr,"Signal handler on host %s\n",hostname); | ||||||
|   fprintf(stderr,"Caught signal %d\n",si->si_signo); |   fprintf(stderr,"Caught signal %d\n",si->si_signo); | ||||||
|   fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr); |   fprintf(stderr,"  mem address %llx\n",(unsigned long long)si->si_addr); | ||||||
|   fprintf(stderr,"         code %d\n",si->si_code); |   fprintf(stderr,"         code %d\n",si->si_code); | ||||||
| @@ -585,7 +587,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) | |||||||
|   ucontext_t * uc= (ucontext_t *)ptr; |   ucontext_t * uc= (ucontext_t *)ptr; | ||||||
|   struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; |   struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext; | ||||||
|   fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip); |   fprintf(stderr,"  instruction %llx\n",(unsigned long long)sc->rip); | ||||||
| #define REG(A)  printf("  %s %lx\n",#A,sc-> A); | #define REG(A)  fprintf(stderr,"  %s %lx\n",#A,sc-> A); | ||||||
|   REG(rdi); |   REG(rdi); | ||||||
|   REG(rsi); |   REG(rsi); | ||||||
|   REG(rbp); |   REG(rbp); | ||||||
| @@ -618,8 +620,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr) | |||||||
|  |  | ||||||
| void Grid_exit_handler(void) | void Grid_exit_handler(void) | ||||||
| { | { | ||||||
|   BACKTRACEFP(stdout); |   //  BACKTRACEFP(stdout); | ||||||
|   fflush(stdout); |   //  fflush(stdout); | ||||||
| } | } | ||||||
| void Grid_debug_handler_init(void) | void Grid_debug_handler_init(void) | ||||||
| { | { | ||||||
| @@ -627,10 +629,10 @@ void Grid_debug_handler_init(void) | |||||||
|   sigemptyset (&sa.sa_mask); |   sigemptyset (&sa.sa_mask); | ||||||
|   sa.sa_sigaction= Grid_sa_signal_handler; |   sa.sa_sigaction= Grid_sa_signal_handler; | ||||||
|   sa.sa_flags    = SA_SIGINFO; |   sa.sa_flags    = SA_SIGINFO; | ||||||
|   sigaction(SIGSEGV,&sa,NULL); |   //  sigaction(SIGSEGV,&sa,NULL); | ||||||
|   sigaction(SIGTRAP,&sa,NULL); |   sigaction(SIGTRAP,&sa,NULL); | ||||||
|   sigaction(SIGBUS,&sa,NULL); |   sigaction(SIGBUS,&sa,NULL); | ||||||
|   sigaction(SIGUSR2,&sa,NULL); |   //  sigaction(SIGUSR2,&sa,NULL); | ||||||
|  |  | ||||||
|   feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); |   feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); | ||||||
|  |  | ||||||
| @@ -638,7 +640,14 @@ void Grid_debug_handler_init(void) | |||||||
|   sigaction(SIGKILL,&sa,NULL); |   sigaction(SIGKILL,&sa,NULL); | ||||||
|   sigaction(SIGILL,&sa,NULL); |   sigaction(SIGILL,&sa,NULL); | ||||||
|  |  | ||||||
|   atexit(Grid_exit_handler); |   // Non terminating SIGUSR1/2 handler | ||||||
|  |   struct sigaction sa_ping; | ||||||
|  |   sigemptyset (&sa_ping.sa_mask); | ||||||
|  |   sa_ping.sa_sigaction= Grid_usr_signal_handler; | ||||||
|  |   sa_ping.sa_flags    = SA_SIGINFO; | ||||||
|  |   sigaction(SIGHUP,&sa_ping,NULL); | ||||||
|  |  | ||||||
|  |   //  atexit(Grid_exit_handler); | ||||||
| } | } | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| # additional include paths necessary to compile the C++ library | # additional include paths necessary to compile the C++ library | ||||||
| SUBDIRS = Grid HMC benchmarks tests examples | SUBDIRS = Grid  benchmarks tests examples HMC | ||||||
|  |  | ||||||
| include $(top_srcdir)/doxygen.inc | include $(top_srcdir)/doxygen.inc | ||||||
|  |  | ||||||
|   | |||||||
| @@ -644,11 +644,6 @@ int main (int argc, char ** argv) | |||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
|  |  | ||||||
|   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); |   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); | ||||||
| #ifdef KNL |  | ||||||
|   LebesgueOrder::Block = std::vector<int>({8,2,2,2}); |  | ||||||
| #else |  | ||||||
|   LebesgueOrder::Block = std::vector<int>({2,2,2,2}); |  | ||||||
| #endif |  | ||||||
|   Benchmark::Decomposition(); |   Benchmark::Decomposition(); | ||||||
|  |  | ||||||
|   int do_su4=1; |   int do_su4=1; | ||||||
|   | |||||||
| @@ -70,7 +70,7 @@ int main (int argc, char ** argv) | |||||||
|     pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101})); |     pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101})); | ||||||
|  |  | ||||||
|     std::vector<double> stop(threads); |     std::vector<double> stop(threads); | ||||||
|     Vector<Vec> sum(threads); |     std::vector<Vec> sum(threads); | ||||||
|  |  | ||||||
|     std::vector<LatticeVec> x(threads,&Grid); |     std::vector<LatticeVec> x(threads,&Grid); | ||||||
|     for(int t=0;t<threads;t++){ |     for(int t=0;t<threads;t++){ | ||||||
|   | |||||||
| @@ -78,9 +78,9 @@ int main (int argc, char ** argv) | |||||||
|     double t0,t1; |     double t0,t1; | ||||||
|      |      | ||||||
|     typedef typename DomainWallFermionD::Coeff_t Coeff_t; |     typedef typename DomainWallFermionD::Coeff_t Coeff_t; | ||||||
|     Vector<Coeff_t> diag = Dw.bs; |     std::vector<Coeff_t> diag = Dw.bs; | ||||||
|     Vector<Coeff_t> upper= Dw.cs; |     std::vector<Coeff_t> upper= Dw.cs; | ||||||
|     Vector<Coeff_t> lower= Dw.cs; |     std::vector<Coeff_t> lower= Dw.cs; | ||||||
|     upper[Ls-1]=-Dw.mass_minus*upper[Ls-1]; |     upper[Ls-1]=-Dw.mass_minus*upper[Ls-1]; | ||||||
|     lower[0]   =-Dw.mass_plus*lower[0]; |     lower[0]   =-Dw.mass_plus*lower[0]; | ||||||
|      |      | ||||||
|   | |||||||
| @@ -118,7 +118,7 @@ public: | |||||||
|     fprintf(FP,"Packet bytes, direction, GB/s per node\n"); |     fprintf(FP,"Packet bytes, direction, GB/s per node\n"); | ||||||
|     for(int lat=16;lat<=maxlat;lat+=8){ |     for(int lat=16;lat<=maxlat;lat+=8){ | ||||||
|       //      for(int Ls=8;Ls<=8;Ls*=2){ |       //      for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|       { int Ls=12; |       { int Ls=8; | ||||||
|  |  | ||||||
| 	Coordinate latt_size  ({lat*mpi_layout[0], | 	Coordinate latt_size  ({lat*mpi_layout[0], | ||||||
| 	      lat*mpi_layout[1], | 	      lat*mpi_layout[1], | ||||||
| @@ -861,7 +861,7 @@ int main (int argc, char ** argv) | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); |   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); | ||||||
|   LebesgueOrder::Block = std::vector<int>({2,2,2,2}); |   //  LebesgueOrder::Block = std::vector<int>({2,2,2,2}); | ||||||
|  |  | ||||||
|   Benchmark::Decomposition(); |   Benchmark::Decomposition(); | ||||||
|  |  | ||||||
| @@ -872,7 +872,7 @@ int main (int argc, char ** argv) | |||||||
|   int do_dslash=1; |   int do_dslash=1; | ||||||
|  |  | ||||||
|   int sel=4; |   int sel=4; | ||||||
|   std::vector<int> L_list({8,12,16,24,32}); |   std::vector<int> L_list({8,12,16,24}); | ||||||
|   int selm1=sel-1; |   int selm1=sel-1; | ||||||
|  |  | ||||||
|   std::vector<double> clover; |   std::vector<double> clover; | ||||||
|   | |||||||
							
								
								
									
										33
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										33
									
								
								configure.ac
									
									
									
									
									
								
							| @@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h) | |||||||
| AC_CHECK_HEADERS(malloc.h) | AC_CHECK_HEADERS(malloc.h) | ||||||
| AC_CHECK_HEADERS(endian.h) | AC_CHECK_HEADERS(endian.h) | ||||||
| AC_CHECK_HEADERS(execinfo.h) | AC_CHECK_HEADERS(execinfo.h) | ||||||
|  | AC_CHECK_HEADERS(numaif.h) | ||||||
| AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) | AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) | ||||||
| AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) | AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) | ||||||
|  |  | ||||||
| @@ -128,6 +129,20 @@ case ${ac_LAPACK} in | |||||||
|         AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; |         AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);; | ||||||
| esac | esac | ||||||
|  |  | ||||||
|  | ############### internal reduction | ||||||
|  | AC_ARG_ENABLE([reduction], | ||||||
|  |     [AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])], | ||||||
|  |     [ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid]) | ||||||
|  |  | ||||||
|  | case ${ac_REDUCTION} in | ||||||
|  |     mpi) | ||||||
|  |         ;; | ||||||
|  |     grid) | ||||||
|  |         AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);; | ||||||
|  |     *) | ||||||
|  |         AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);; | ||||||
|  | esac | ||||||
|  |  | ||||||
| ############### tracing | ############### tracing | ||||||
| AC_ARG_ENABLE([tracing], | AC_ARG_ENABLE([tracing], | ||||||
|     [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])], |     [AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])], | ||||||
| @@ -225,19 +240,21 @@ case ${ac_SFW_FP16} in | |||||||
|       AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; |       AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; | ||||||
| esac | esac | ||||||
|  |  | ||||||
| ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons |  | ||||||
|  | ############### MPI BOUNCE TO HOST | ||||||
| AC_ARG_ENABLE([accelerator-aware-mpi], | AC_ARG_ENABLE([accelerator-aware-mpi], | ||||||
|     [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], |     [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], | ||||||
|     [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) |     [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) | ||||||
|  |  | ||||||
|  | # Force accelerator CSHIFT now | ||||||
|  | AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device]) | ||||||
|  |  | ||||||
| case ${ac_ACCELERATOR_AWARE_MPI} in | case ${ac_ACCELERATOR_AWARE_MPI} in | ||||||
|     yes) |     yes) | ||||||
|       AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) |  | ||||||
|       AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; |       AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; | ||||||
|     *);; |     *);; | ||||||
| esac | esac | ||||||
|  |  | ||||||
|  |  | ||||||
| ############### SYCL/CUDA/HIP/none | ############### SYCL/CUDA/HIP/none | ||||||
| AC_ARG_ENABLE([accelerator], | AC_ARG_ENABLE([accelerator], | ||||||
|     [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])], |     [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])], | ||||||
| @@ -664,16 +681,6 @@ case ${ac_SHM_FAST_PATH} in | |||||||
|      *) ;; |      *) ;; | ||||||
| esac | esac | ||||||
|  |  | ||||||
| ############### communication type selection |  | ||||||
| AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes]) |  | ||||||
|  |  | ||||||
| case ${ac_COMMS_THREADS} in |  | ||||||
|      yes) |  | ||||||
|         AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] ) |  | ||||||
|       ;; |  | ||||||
|      *) ;; |  | ||||||
| esac |  | ||||||
|  |  | ||||||
| ############### communication type selection | ############### communication type selection | ||||||
| AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) | AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,383 +0,0 @@ | |||||||
| /* |  | ||||||
|  * Warning: This code illustrative only: not well tested, and not meant for production use |  | ||||||
|  * without regression / tests being applied |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
|  |  | ||||||
| using namespace std; |  | ||||||
| using namespace Grid; |  | ||||||
|  |  | ||||||
| RealD LLscale =1.0; |  | ||||||
| RealD LCscale =1.0; |  | ||||||
|  |  | ||||||
| template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field> |  | ||||||
| { |  | ||||||
| public: |  | ||||||
|   INHERIT_GIMPL_TYPES(Gimpl); |  | ||||||
|  |  | ||||||
|   GridBase *grid; |  | ||||||
|   GaugeField U; |  | ||||||
|    |  | ||||||
|   CovariantLaplacianCshift(GaugeField &_U)    : |  | ||||||
|     grid(_U.Grid()), |  | ||||||
|     U(_U) {  }; |  | ||||||
|  |  | ||||||
|   virtual GridBase *Grid(void) { return grid; }; |  | ||||||
|  |  | ||||||
|   virtual void  M    (const Field &in, Field &out) |  | ||||||
|   { |  | ||||||
|     out=Zero(); |  | ||||||
|     for(int mu=0;mu<Nd-1;mu++) { |  | ||||||
|       GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent |  | ||||||
|       out = out - Gimpl::CovShiftForward(Umu,mu,in);     |  | ||||||
|       out = out - Gimpl::CovShiftBackward(Umu,mu,in);     |  | ||||||
|       out = out + 2.0*in; |  | ||||||
|     } |  | ||||||
|   }; |  | ||||||
|   virtual void  Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian |  | ||||||
|   virtual  void Mdiag    (const Field &in, Field &out)                  {assert(0);}; // Unimplemented need only for multigrid |  | ||||||
|   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid |  | ||||||
|   virtual  void MdirAll  (const Field &in, std::vector<Field> &out)     {assert(0);}; // Unimplemented need only for multigrid |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| void MakePhase(Coordinate mom,LatticeComplex &phase) |  | ||||||
| { |  | ||||||
|   GridBase *grid = phase.Grid(); |  | ||||||
|   auto latt_size = grid->GlobalDimensions(); |  | ||||||
|   ComplexD ci(0.0,1.0); |  | ||||||
|   phase=Zero(); |  | ||||||
|  |  | ||||||
|   LatticeComplex coor(phase.Grid()); |  | ||||||
|   for(int mu=0;mu<Nd;mu++){ |  | ||||||
|     RealD TwoPiL =  M_PI * 2.0/ latt_size[mu]; |  | ||||||
|     LatticeCoordinate(coor,mu); |  | ||||||
|     phase = phase + (TwoPiL * mom[mu]) * coor; |  | ||||||
|   } |  | ||||||
|   phase = exp(phase*ci); |  | ||||||
| } |  | ||||||
| void PointSource(Coordinate &coor,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   //  Coordinate coor({0,0,0,0}); |  | ||||||
|   source=Zero(); |  | ||||||
|   SpinColourMatrix kronecker; kronecker=1.0; |  | ||||||
|   pokeSite(kronecker,source,coor); |  | ||||||
| } |  | ||||||
| void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   GridBase *grid = source.Grid(); |  | ||||||
|   LatticeComplex noise(grid); |  | ||||||
|   LatticeComplex zz(grid); zz=Zero(); |  | ||||||
|   LatticeInteger t(grid); |  | ||||||
|  |  | ||||||
|   RealD nrm=1.0/sqrt(2); |  | ||||||
|   bernoulli(RNG, noise); // 0,1 50:50 |  | ||||||
|  |  | ||||||
|   noise = (2.*noise - Complex(1,1))*nrm; |  | ||||||
|  |  | ||||||
|   LatticeCoordinate(t,Tdir); |  | ||||||
|   noise = where(t==Integer(tslice), noise, zz); |  | ||||||
|  |  | ||||||
|   source = 1.0; |  | ||||||
|   source = source*noise; |  | ||||||
|   std::cout << " Z2 wall " << norm2(source) << std::endl; |  | ||||||
| } |  | ||||||
| template<class Field> |  | ||||||
| void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) |  | ||||||
| { |  | ||||||
|   typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t; |  | ||||||
|   Laplacian_t Laplacian(U); |  | ||||||
|  |  | ||||||
|   Integer Iterations = 40; |  | ||||||
|   Real width = 2.0; |  | ||||||
|   Real coeff = (width*width) / Real(4*Iterations); |  | ||||||
|  |  | ||||||
|   Field tmp(U.Grid()); |  | ||||||
|   smeared=unsmeared; |  | ||||||
|   //  chi = (1-p^2/2N)^N kronecker |  | ||||||
|   for(int n = 0; n < Iterations; ++n) { |  | ||||||
|     Laplacian.M(smeared,tmp); |  | ||||||
|     smeared = smeared - coeff*tmp; |  | ||||||
|     std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   LatticePropagator tmp(source.Grid()); |  | ||||||
|   PointSource(site,source); |  | ||||||
|   std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl; |  | ||||||
|   tmp = source; |  | ||||||
|   GaussianSmear(U,tmp,source); |  | ||||||
|   std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl; |  | ||||||
| } |  | ||||||
| void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   Z2WallSource(RNG,tslice,source); |  | ||||||
|   auto tmp = source; |  | ||||||
|   GaussianSmear(U,tmp,source); |  | ||||||
| } |  | ||||||
| void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   assert(mom.size()==Nd); |  | ||||||
|   assert(mom[Tdir] == 0); |  | ||||||
|  |  | ||||||
|   GridBase * grid = spectator.Grid(); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   LatticeInteger ts(grid); |  | ||||||
|   LatticeCoordinate(ts,Tdir); |  | ||||||
|   source = Zero(); |  | ||||||
|   source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else |  | ||||||
|  |  | ||||||
|   LatticeComplex phase(grid); |  | ||||||
|   MakePhase(mom,phase); |  | ||||||
|  |  | ||||||
|   source = source *phase; |  | ||||||
| } |  | ||||||
| template<class Action> |  | ||||||
| void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) |  | ||||||
| { |  | ||||||
|   GridBase *UGrid = D.GaugeGrid(); |  | ||||||
|   GridBase *FGrid = D.FermionGrid(); |  | ||||||
|  |  | ||||||
|   LatticeFermion src4  (UGrid);  |  | ||||||
|   LatticeFermion src5  (FGrid);  |  | ||||||
|   LatticeFermion result5(FGrid); |  | ||||||
|   LatticeFermion result4(UGrid); |  | ||||||
|   LatticePropagator prop5(FGrid); |  | ||||||
|    |  | ||||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-8,100000); |  | ||||||
|   SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG); |  | ||||||
|   ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors |  | ||||||
|    for(int s=0;s<Nd;s++){ |  | ||||||
|     for(int c=0;c<Nc;c++){ |  | ||||||
|       PropToFerm<Action>(src4,source,s,c); |  | ||||||
|  |  | ||||||
|       D.ImportPhysicalFermionSource(src4,src5); |  | ||||||
|  |  | ||||||
|       result5=Zero(); |  | ||||||
|       schur(D,src5,result5,ZG); |  | ||||||
|       std::cout<<GridLogMessage |  | ||||||
| 	       <<"spin "<<s<<" color "<<c |  | ||||||
| 	       <<" norm2(src5d) "   <<norm2(src5) |  | ||||||
|                <<" norm2(result5d) "<<norm2(result5)<<std::endl; |  | ||||||
|  |  | ||||||
|       D.ExportPhysicalFermionSolution(result5,result4); |  | ||||||
|  |  | ||||||
|       FermToProp<Action>(prop5,result5,s,c); |  | ||||||
|       FermToProp<Action>(propagator,result4,s,c); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   LatticePropagator Axial_mu(UGrid);  |  | ||||||
|   LatticePropagator Vector_mu(UGrid);  |  | ||||||
|  |  | ||||||
|   LatticeComplex    PA (UGrid);  |  | ||||||
|   LatticeComplex    VV (UGrid);  |  | ||||||
|   LatticeComplex    PJ5q(UGrid); |  | ||||||
|   LatticeComplex    PP (UGrid); |  | ||||||
|  |  | ||||||
|   std::vector<TComplex> sumPA; |  | ||||||
|   std::vector<TComplex> sumVV; |  | ||||||
|   std::vector<TComplex> sumPP; |  | ||||||
|   std::vector<TComplex> sumPJ5q; |  | ||||||
|  |  | ||||||
|   Gamma g5(Gamma::Algebra::Gamma5); |  | ||||||
|   D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); |  | ||||||
|   PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current |  | ||||||
|   sliceSum(PA,sumPA,Tdir); |  | ||||||
|  |  | ||||||
|   int Nt{static_cast<int>(sumPA.size())}; |  | ||||||
|  |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl; |  | ||||||
|  |  | ||||||
|   PP       = trace(adj(propagator)*propagator); // Pseudoscalar density |  | ||||||
|   sliceSum(PP,sumPP,Tdir); |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl; |  | ||||||
|    |  | ||||||
|   D.ContractJ5q(prop5,PJ5q); |  | ||||||
|   sliceSum(PJ5q,sumPJ5q,Tdir); |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl; |  | ||||||
|  |  | ||||||
|   Gamma::Algebra GammaV[3] = { |  | ||||||
|     Gamma::Algebra::GammaX, |  | ||||||
|     Gamma::Algebra::GammaY, |  | ||||||
|     Gamma::Algebra::GammaZ |  | ||||||
|   }; |  | ||||||
|   for( int mu=0;mu<3;mu++ ) { |  | ||||||
|     Gamma gV(GammaV[mu]); |  | ||||||
|     D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); |  | ||||||
|     //    auto ss=sliceSum(Vector_mu,Tdir); |  | ||||||
|     //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl; |  | ||||||
|     VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current |  | ||||||
|     sliceSum(VV,sumVV,Tdir); |  | ||||||
|     for(int t=0;t<Nt;t++){ |  | ||||||
|       RealD Ct = real(TensorRemove(sumVV[t]))*LCscale; |  | ||||||
|       std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct |  | ||||||
| 	       << " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| class MesonFile: Serializable { |  | ||||||
| public: |  | ||||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data); |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| // Contract two propagators into vector-meson correlators (GammaX, GammaY, GammaZ at source and sink), |  | ||||||
| // print every timeslice scaled by LLscale and write the full table to an XML file. |  | ||||||
| //   file  : name of the XML file to write |  | ||||||
| //   q1,q2 : quark propagators; q1 enters as G5*adj(q1)*G5 |  | ||||||
| //   phase : kept in the signature for callers; it is not used in this routine |  | ||||||
| void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) |  | ||||||
| { |  | ||||||
|   // One row per channel: {source gamma, sink gamma} |  | ||||||
|   const int nGamma=3; |  | ||||||
|   Gamma::Algebra channelTable[nGamma][2] = { |  | ||||||
|     {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, |  | ||||||
|     {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, |  | ||||||
|     {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|   Gamma gamma5(Gamma::Algebra::Gamma5); |  | ||||||
|  |  | ||||||
|   MesonFile output; |  | ||||||
|   LatticeComplex correlator(q1.Grid()); |  | ||||||
|  |  | ||||||
|   for(int ich=0;ich<nGamma;ich++){ |  | ||||||
|  |  | ||||||
|     Gamma srcGamma (channelTable[ich][0]); |  | ||||||
|     Gamma sinkGamma(channelTable[ich][1]); |  | ||||||
|  |  | ||||||
|     correlator = trace(gamma5*adj(q1)*gamma5*sinkGamma*q2*adj(srcGamma)); |  | ||||||
|  |  | ||||||
|     // Reduce the correlator field to one value per timeslice |  | ||||||
|     std::vector<TComplex> sliced; |  | ||||||
|     sliceSum(correlator,sliced, Tdir); |  | ||||||
|  |  | ||||||
|     const int nSlices=sliced.size(); |  | ||||||
|     std::vector<Complex> channelCorr(nSlices); |  | ||||||
|  |  | ||||||
|     for(int ts=0;ts<nSlices;ts++){ |  | ||||||
|       channelCorr[ts] = TensorRemove(sliced[ts])*LLscale; // Yes this is ugly, not figured a work around |  | ||||||
|       const auto re = real(channelCorr[ts]); |  | ||||||
|       std::cout << " channel "<<ich<<" t "<<ts<<" " <<re<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * ts*ts*ts *re<<std::endl; |  | ||||||
|     } |  | ||||||
|     output.data.push_back(channelCorr); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // The writer is confined to its own scope so it is destroyed right after the write |  | ||||||
|   { |  | ||||||
|     XmlWriter xml(file); |  | ||||||
|     write(xml,"MesonFile",output); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Driver: builds the 4d/5d grids (Ls=32), takes the gauge field either from the NERSC file named in |  | ||||||
| // argv[1] (when present and not starting with '-') or from a cold configuration with a mean-field |  | ||||||
| // shifted M5, solves a point-source Mobius propagator per mass and writes the meson correlators. |  | ||||||
| int main (int argc, char ** argv) |  | ||||||
| { |  | ||||||
|   const int Ls=32; |  | ||||||
|  |  | ||||||
|   Grid_init(&argc,&argv); |  | ||||||
|  |  | ||||||
|   // Double precision grids |  | ||||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), |  | ||||||
|                                                                    GridDefaultSimd(Nd,vComplex::Nsimd()), |  | ||||||
|                                                                    GridDefaultMpi()); |  | ||||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); |  | ||||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); |  | ||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////// |  | ||||||
|   // You can manage seeds however you like. |  | ||||||
|   // Recommend SeedUniqueString. |  | ||||||
|   ////////////////////////////////////////////////////////////////////// |  | ||||||
|   std::vector<int> seeds4({1,2,3,4}); |  | ||||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); // only needed by the (disabled) Z2 wall source |  | ||||||
|  |  | ||||||
|   LatticeGaugeField Umu(UGrid); |  | ||||||
|   std::string config; |  | ||||||
|   RealD M5=1.8; |  | ||||||
|   if( argc > 1 && argv[1][0] != '-' ) |  | ||||||
|   { |  | ||||||
|     // A loaded configuration keeps M5 at its initial value of 1.8 |  | ||||||
|     std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl; |  | ||||||
|     FieldMetaData header; |  | ||||||
|     NerscIO::readConfiguration(Umu, header, argv[1]); |  | ||||||
|     config=argv[1]; |  | ||||||
|   } |  | ||||||
|   else |  | ||||||
|   { |  | ||||||
|     SU<Nc>::ColdConfiguration(Umu); |  | ||||||
|     config="ColdConfig"; |  | ||||||
|     //    RealD P=1.0; // Don't scale |  | ||||||
|     RealD P=0.5871119; // 48I |  | ||||||
|     //    RealD P=0.6153342; // 64I |  | ||||||
|     //    RealD P=0.6388238 // 32Ifine |  | ||||||
|     RealD u0 = sqrt(sqrt(P));          // fourth root of the plaquette value P |  | ||||||
|     RealD M5mf = M5 - 4.0*(1.0-u0);    // mean-field shifted M5 |  | ||||||
|     RealD w0   = 1.0 - M5mf; |  | ||||||
| #if 0 |  | ||||||
|     // M5=1.8 with U=u0 |  | ||||||
|     Umu = Umu * u0; |  | ||||||
|     LLscale = 1.0; |  | ||||||
|     LCscale = 1.0; |  | ||||||
|     std::cout<<GridLogMessage <<"Gauge links are u=u0= "<<u0<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl; |  | ||||||
| #else |  | ||||||
|     M5 = M5mf; |  | ||||||
|     std::cout<<GridLogMessage <<"Gauge links are u=1  "<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"u0="<<u0<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"M5=M5mf =  "<<M5<<std::endl; |  | ||||||
|     LLscale = 1.0/(1-w0*w0)/(1-w0*w0); |  | ||||||
|     LCscale = 1.0/(1-w0*w0)/(1-w0*w0); |  | ||||||
| #endif |  | ||||||
|     std::cout<<GridLogMessage <<"LLscale =  "<<LLscale<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"LCscale =  "<<LCscale<<std::endl; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   std::vector<RealD> masses({ 0.00} ); // u/d, s, c ?? |  | ||||||
|  |  | ||||||
|   int nmass = masses.size(); |  | ||||||
|  |  | ||||||
|   // Owned by this function; released before Grid_finalize below |  | ||||||
|   std::vector<MobiusFermionD *> FermActs; |  | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage <<"======================"<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage <<"======================"<<std::endl; |  | ||||||
|  |  | ||||||
|   for(auto mass: masses) { |  | ||||||
|  |  | ||||||
|     RealD b=1.5;// Scale factor b+c=2, b-c=1 |  | ||||||
|     RealD c=0.5; |  | ||||||
|  |  | ||||||
|     FermActs.push_back(new MobiusFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c)); |  | ||||||
|  |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   LatticePropagator point_source(UGrid); |  | ||||||
|   //  LatticePropagator wall_source(UGrid); |  | ||||||
|  |  | ||||||
|   Coordinate Origin({0,0,0,0}); |  | ||||||
|   PointSource   (Origin,point_source); |  | ||||||
|   //  Z2WallSource  (RNG4,0,wall_source); |  | ||||||
|  |  | ||||||
|   std::vector<LatticePropagator> PointProps(nmass,UGrid); |  | ||||||
|   //  std::vector<LatticePropagator> GaussProps(nmass,UGrid); |  | ||||||
|   //  std::vector<LatticePropagator> Z2Props   (nmass,UGrid); |  | ||||||
|  |  | ||||||
|   for(int m=0;m<nmass;m++) { |  | ||||||
|  |  | ||||||
|     Solve(*FermActs[m],point_source   ,PointProps[m]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   LatticeComplex phase(UGrid); |  | ||||||
|   Coordinate mom({0,0,0,0}); |  | ||||||
|   MakePhase(mom,phase); |  | ||||||
|  |  | ||||||
|   // One output file per (m1 <= m2) mass pair |  | ||||||
|   for(int m1=0 ;m1<nmass;m1++) { |  | ||||||
|   for(int m2=m1;m2<nmass;m2++) { |  | ||||||
|     std::stringstream ssp; |  | ||||||
|  |  | ||||||
|     ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml"; |  | ||||||
|  |  | ||||||
|     MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase); |  | ||||||
|     //    MesonTrace(<config>_m<m1>_m<m2>_wall_meson.xml,Z2Props[m1],Z2Props[m2],phase); |  | ||||||
|   }} |  | ||||||
|  |  | ||||||
|   // Release the fermion actions allocated with new above |  | ||||||
|   for(auto *action: FermActs) delete action; |  | ||||||
|   FermActs.clear(); |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |  | ||||||
|   return 0; |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1,479 +0,0 @@ | |||||||
| /* |  | ||||||
|  * Warning: this code is illustrative only: it is not well tested, and is not meant for production use |  | ||||||
|  * without regression tests being applied |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
|  |  | ||||||
| using namespace std; |  | ||||||
| using namespace Grid; |  | ||||||
|  |  | ||||||
| // Global normalisation factors, assigned in main(): LLscale multiplies the correlators built in MesonTrace, |  | ||||||
| // LCscale multiplies the current/density correlators printed by Solve and MasslessFreePropagator. |  | ||||||
| RealD LLscale =1.0; |  | ||||||
| RealD LCscale =1.0; |  | ||||||
|  |  | ||||||
| // Gauge-covariant Laplacian over directions 0..Nd-2, assembled from covariant forward/backward shifts. |  | ||||||
| // Only M and Mdag are implemented; Mdiag, Mdir and MdirAll assert when called. |  | ||||||
| template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field> |  | ||||||
| { |  | ||||||
| public: |  | ||||||
|   INHERIT_GIMPL_TYPES(Gimpl); |  | ||||||
|  |  | ||||||
|   GridBase *grid; |  | ||||||
|   GaugeField U; |  | ||||||
|  |  | ||||||
|   // Stores a copy of the gauge field together with the grid it is defined on |  | ||||||
|   CovariantLaplacianCshift(GaugeField &gauge) |  | ||||||
|     : grid(gauge.Grid()), U(gauge) |  | ||||||
|   { |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   virtual GridBase *Grid(void) |  | ||||||
|   { |  | ||||||
|     return grid; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // out = sum over dir < Nd-1 of ( 2*in - forward hop - backward hop ) |  | ||||||
|   virtual void M(const Field &in, Field &out) |  | ||||||
|   { |  | ||||||
|     out=Zero(); |  | ||||||
|     for(int dir=0;dir<Nd-1;dir++) { |  | ||||||
|       GaugeLinkField link = PeekIndex<LorentzIndex>(U, dir); // NB: inefficient, the link is re-extracted on every call |  | ||||||
|       out = out - Gimpl::CovShiftForward(link,dir,in); |  | ||||||
|       out = out - Gimpl::CovShiftBackward(link,dir,in); |  | ||||||
|       out = out + 2.0*in; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Laplacian is hermitian, so Mdag forwards to M |  | ||||||
|   virtual void Mdag(const Field &in, Field &out) |  | ||||||
|   { |  | ||||||
|     M(in,out); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // The remaining operators are unimplemented; they are needed only for multigrid |  | ||||||
|   virtual void Mdiag(const Field &in, Field &out) |  | ||||||
|   { |  | ||||||
|     assert(0); |  | ||||||
|   } |  | ||||||
|   virtual void Mdir(const Field &in, Field &out,int dir, int disp) |  | ||||||
|   { |  | ||||||
|     assert(0); |  | ||||||
|   } |  | ||||||
|   virtual void MdirAll(const Field &in, std::vector<Field> &out) |  | ||||||
|   { |  | ||||||
|     assert(0); |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| // Fill `phase` with exp( i * sum_mu 2*pi*mom[mu]*x[mu]/L[mu] ), where L are the global lattice dimensions. |  | ||||||
| void MakePhase(Coordinate mom,LatticeComplex &phase) |  | ||||||
| { |  | ||||||
|   const auto &dims = phase.Grid()->GlobalDimensions(); |  | ||||||
|   const ComplexD imag_unit(0.0,1.0); |  | ||||||
|  |  | ||||||
|   LatticeComplex xmu(phase.Grid()); // site coordinate along the current direction |  | ||||||
|  |  | ||||||
|   // Accumulate the real exponent first, exponentiate once at the end |  | ||||||
|   phase=Zero(); |  | ||||||
|   for(int dir=0;dir<Nd;dir++){ |  | ||||||
|     const RealD two_pi_over_L =  M_PI * 2.0/ dims[dir]; |  | ||||||
|     LatticeCoordinate(xmu,dir); |  | ||||||
|     phase = phase + (two_pi_over_L * mom[dir]) * xmu; |  | ||||||
|   } |  | ||||||
|   phase = exp(phase*imag_unit); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Point source: zero everywhere except a unit spin-colour matrix poked in at site `coor`. |  | ||||||
| void PointSource(Coordinate &coor,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   SpinColourMatrix unit; |  | ||||||
|   unit=1.0; |  | ||||||
|  |  | ||||||
|   source=Zero(); |  | ||||||
|   pokeSite(unit,source,coor); |  | ||||||
| } |  | ||||||
| // Noise wall source: bernoulli noise rescaled by (2*noise - (1,1))/sqrt(2), kept on timeslice `tslice` |  | ||||||
| // only, and multiplied into a unit propagator. Prints norm2 of the result. |  | ||||||
| void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   GridBase *g = source.Grid(); |  | ||||||
|  |  | ||||||
|   // Draw the noise and rescale it |  | ||||||
|   LatticeComplex z2(g); |  | ||||||
|   bernoulli(RNG, z2); // 0,1 50:50 |  | ||||||
|   const RealD invSqrt2=1.0/sqrt(2); |  | ||||||
|   z2 = (2.*z2 - Complex(1,1))*invSqrt2; |  | ||||||
|  |  | ||||||
|   // Zero the noise away from the requested timeslice |  | ||||||
|   LatticeInteger tcoor(g); |  | ||||||
|   LatticeCoordinate(tcoor,Tdir); |  | ||||||
|   LatticeComplex nothing(g); |  | ||||||
|   nothing=Zero(); |  | ||||||
|   z2 = where(tcoor==Integer(tslice), z2, nothing); |  | ||||||
|  |  | ||||||
|   source = 1.0; |  | ||||||
|   source = source*z2; |  | ||||||
|   std::cout << " Z2 wall " << norm2(source) << std::endl; |  | ||||||
| } |  | ||||||
| // Smear a field by applying (1 - coeff*Laplacian) repeatedly: 40 iterations, width 2.0, |  | ||||||
| // coeff = width^2/(4*iterations). Prints norm2 of the field after every iteration. |  | ||||||
| template<class Field> |  | ||||||
| void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) |  | ||||||
| { |  | ||||||
|   typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t; |  | ||||||
|  |  | ||||||
|   const Integer nIter = 40; |  | ||||||
|   const Real    sigma = 2.0; |  | ||||||
|   const Real    step  = (sigma*sigma) / Real(4*nIter); |  | ||||||
|  |  | ||||||
|   Laplacian_t laplace(U); |  | ||||||
|   Field lapSmeared(U.Grid()); |  | ||||||
|  |  | ||||||
|   smeared=unsmeared; |  | ||||||
|   //  chi = (1-p^2/2N)^N kronecker |  | ||||||
|   int iter = 0; |  | ||||||
|   while( iter < nIter ) { |  | ||||||
|     laplace.M(smeared,lapSmeared); |  | ||||||
|     smeared = smeared - step*lapSmeared; |  | ||||||
|     std::cout << " smear iter " << iter<<" " <<norm2(smeared)<<std::endl; |  | ||||||
|     ++iter; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| // Gaussian source: a point source at `site`, smeared with GaussianSmear. Prints norm2 before and after smearing. |  | ||||||
| void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   PointSource(site,source); |  | ||||||
|   std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl; |  | ||||||
|  |  | ||||||
|   // Smear from a copy, because `source` is also the output field |  | ||||||
|   LatticePropagator delta(source.Grid()); |  | ||||||
|   delta = source; |  | ||||||
|   GaussianSmear(U,delta,source); |  | ||||||
|   std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl; |  | ||||||
| } |  | ||||||
| // Smeared wall source: builds the Z2 wall on `tslice`, then smears it with GaussianSmear. |  | ||||||
| void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   Z2WallSource(RNG,tslice,source); |  | ||||||
|  |  | ||||||
|   // Smear from a copy, because `source` is also the output field |  | ||||||
|   LatticePropagator wall = source; |  | ||||||
|   GaussianSmear(U,wall,source); |  | ||||||
| } |  | ||||||
| // Sequential source: the spectator propagator restricted to timeslice `tslice`, multiplied by the |  | ||||||
| // momentum phase built by MakePhase. Asserts that `mom` has Nd entries and no temporal component. |  | ||||||
| void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   assert(mom.size()==Nd); |  | ||||||
|   assert(mom[Tdir] == 0); |  | ||||||
|  |  | ||||||
|   GridBase * g = spectator.Grid(); |  | ||||||
|  |  | ||||||
|   // Stick in a slice of the spectator, zero everywhere else |  | ||||||
|   LatticeInteger tcoor(g); |  | ||||||
|   LatticeCoordinate(tcoor,Tdir); |  | ||||||
|   source = Zero(); |  | ||||||
|   source = where(tcoor==Integer(tslice),spectator,source); |  | ||||||
|  |  | ||||||
|   // Apply the momentum phase |  | ||||||
|   LatticeComplex momPhase(g); |  | ||||||
|   MakePhase(mom,momPhase); |  | ||||||
|   source = source *momPhase; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Build the free propagator of action D from `source`, one spin/colour component at a time, using |  | ||||||
| // D.FreePropagator on the 5d field, then print the conserved vector current correlator for mu=0,1,2. |  | ||||||
| //   D          : fermion action providing FreePropagator, Import/Export and ContractConservedCurrent |  | ||||||
| //   source     : 4d source propagator |  | ||||||
| //   propagator : receives the 4d propagator (output) |  | ||||||
| // The printed "2 pi^2 t^3 C(t)" column is computed as a product so that t=0 prints 0 rather than |  | ||||||
| // dividing by a zero reference value. |  | ||||||
| template<class Action> |  | ||||||
| void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) |  | ||||||
| { |  | ||||||
|   GridBase *UGrid = source.Grid(); |  | ||||||
|   GridBase *FGrid = D.FermionGrid(); |  | ||||||
|   const bool fiveD = true; //calculate 5d free propagator |  | ||||||
|   RealD mass = D.Mass(); |  | ||||||
|   LatticeFermion src4  (UGrid); |  | ||||||
|   LatticeFermion result4  (UGrid); |  | ||||||
|   LatticeFermion result5(FGrid); |  | ||||||
|   LatticeFermion src5(FGrid); |  | ||||||
|   LatticePropagator prop5(FGrid); |  | ||||||
|   for(int s=0;s<Nd;s++){ |  | ||||||
|     for(int c=0;c<Nc;c++){ |  | ||||||
|  |  | ||||||
|       PropToFerm<Action>(src4,source,s,c); |  | ||||||
|  |  | ||||||
|       D.ImportPhysicalFermionSource(src4,src5); |  | ||||||
|       D.FreePropagator(src5,result5,mass,fiveD); |  | ||||||
|       std::cout<<GridLogMessage |  | ||||||
|                <<"Free 5D prop spin "<<s<<" color "<<c |  | ||||||
|                <<" norm2(src5d) "   <<norm2(src5) |  | ||||||
|                <<" norm2(result5d) "<<norm2(result5)<<std::endl; |  | ||||||
|  |  | ||||||
|       D.ExportPhysicalFermionSolution(result5,result4); |  | ||||||
|  |  | ||||||
|       // Keep both the 5d solution (for the current contraction) and the 4d propagator |  | ||||||
|       FermToProp<Action>(prop5,result5,s,c); |  | ||||||
|       FermToProp<Action>(propagator,result4,s,c); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   LatticePropagator Vector_mu(UGrid); |  | ||||||
|   LatticeComplex    VV (UGrid); |  | ||||||
|   std::vector<TComplex> sumVV; |  | ||||||
|   Gamma::Algebra GammaV[3] = { |  | ||||||
|     Gamma::Algebra::GammaX, |  | ||||||
|     Gamma::Algebra::GammaY, |  | ||||||
|     Gamma::Algebra::GammaZ |  | ||||||
|   }; |  | ||||||
|   for( int mu=0;mu<3;mu++ ) { |  | ||||||
|     Gamma gV(GammaV[mu]); |  | ||||||
|     D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); |  | ||||||
|     VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current |  | ||||||
|     sliceSum(VV,sumVV,Tdir); |  | ||||||
|     int Nt = sumVV.size(); |  | ||||||
|     for(int t=0;t<Nt;t++){ |  | ||||||
|       RealD Ct = real(TensorRemove(sumVV[t]))*LCscale; |  | ||||||
|       RealD t3scale = 2 * M_PI *M_PI * t*t*t; // 2 pi^2 t^3, zero at t=0 |  | ||||||
|       RealD Cont=0;                           // reference value 1/(2 pi^2 t^3), left at 0 for t=0 |  | ||||||
|       if(t) Cont=1.0/t3scale; |  | ||||||
|       std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct |  | ||||||
|                << " 2 pi^2 t^3 C(t) "<< Ct*t3scale << " delta Ct "<< Ct-Cont <<std::endl; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| // 4d variant: applies D.FreePropagator directly to each spin/colour component of `source` |  | ||||||
| // (five-dimensional flag off) and assembles the results into `propagator`. |  | ||||||
| template<class Action> |  | ||||||
| void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator) |  | ||||||
| { |  | ||||||
|   const bool fiveD = false; //calculate 4d free propagator |  | ||||||
|   const RealD mq = D.Mass(); |  | ||||||
|  |  | ||||||
|   GridBase *UGrid = source.Grid(); |  | ||||||
|   LatticeFermion column_in (UGrid); |  | ||||||
|   LatticeFermion column_out(UGrid); |  | ||||||
|  |  | ||||||
|   for(int spin=0;spin<Nd;spin++){ |  | ||||||
|     for(int colour=0;colour<Nc;colour++){ |  | ||||||
|       PropToFerm<Action>(column_in,source,spin,colour); |  | ||||||
|       D.FreePropagator(column_in,column_out,mq,fiveD); |  | ||||||
|       FermToProp<Action>(propagator,column_out,spin,colour); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Solve for the propagator of action D from `source`, one spin/colour component at a time, with a |  | ||||||
| // red-black Schur preconditioned conjugate gradient (tolerance 1e-10, at most 100000 iterations). |  | ||||||
| // Afterwards prints, per timeslice, the PAc, PP, PJ5q and conserved vector current (VVc) correlators. |  | ||||||
| //   D          : fermion action |  | ||||||
| //   source     : 4d source propagator |  | ||||||
| //   propagator : receives the 4d solution (output) |  | ||||||
| // The printed "2 pi^2 t^3 C(t)" column is computed as a product so that t=0 prints 0 rather than |  | ||||||
| // dividing by a zero reference value. |  | ||||||
| template<class Action> |  | ||||||
| void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) |  | ||||||
| { |  | ||||||
|   GridBase *UGrid = D.GaugeGrid(); |  | ||||||
|   GridBase *FGrid = D.FermionGrid(); |  | ||||||
|  |  | ||||||
|   LatticeFermion src4  (UGrid); |  | ||||||
|   LatticeFermion src5  (FGrid); |  | ||||||
|   LatticeFermion result5(FGrid); |  | ||||||
|   LatticeFermion result4(UGrid); |  | ||||||
|   LatticePropagator prop5(FGrid); |  | ||||||
|  |  | ||||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-10,100000); |  | ||||||
|   SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG); |  | ||||||
|   ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors |  | ||||||
|   for(int s=0;s<Nd;s++){ |  | ||||||
|     for(int c=0;c<Nc;c++){ |  | ||||||
|       PropToFerm<Action>(src4,source,s,c); |  | ||||||
|  |  | ||||||
|       D.ImportPhysicalFermionSource(src4,src5); |  | ||||||
|  |  | ||||||
|       result5=Zero(); |  | ||||||
|       schur(D,src5,result5,ZG); |  | ||||||
|       std::cout<<GridLogMessage |  | ||||||
|                <<"spin "<<s<<" color "<<c |  | ||||||
|                <<" norm2(src5d) "   <<norm2(src5) |  | ||||||
|                <<" norm2(result5d) "<<norm2(result5)<<std::endl; |  | ||||||
|  |  | ||||||
|       D.ExportPhysicalFermionSolution(result5,result4); |  | ||||||
|  |  | ||||||
|       // Keep both the 5d solution (for the current contractions) and the 4d propagator |  | ||||||
|       FermToProp<Action>(prop5,result5,s,c); |  | ||||||
|       FermToProp<Action>(propagator,result4,s,c); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   LatticePropagator Axial_mu(UGrid); |  | ||||||
|   LatticePropagator Vector_mu(UGrid); |  | ||||||
|  |  | ||||||
|   LatticeComplex    PA (UGrid); |  | ||||||
|   LatticeComplex    VV (UGrid); |  | ||||||
|   LatticeComplex    PJ5q(UGrid); |  | ||||||
|   LatticeComplex    PP (UGrid); |  | ||||||
|  |  | ||||||
|   std::vector<TComplex> sumPA; |  | ||||||
|   std::vector<TComplex> sumVV; |  | ||||||
|   std::vector<TComplex> sumPP; |  | ||||||
|   std::vector<TComplex> sumPJ5q; |  | ||||||
|  |  | ||||||
|   Gamma g5(Gamma::Algebra::Gamma5); |  | ||||||
|   D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); |  | ||||||
|   PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current |  | ||||||
|   sliceSum(PA,sumPA,Tdir); |  | ||||||
|  |  | ||||||
|   // Number of timeslices, taken from the first slice sum and reused for all later loops |  | ||||||
|   int Nt{static_cast<int>(sumPA.size())}; |  | ||||||
|  |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl; |  | ||||||
|  |  | ||||||
|   PP       = trace(adj(propagator)*propagator); // Pseudoscalar density |  | ||||||
|   sliceSum(PP,sumPP,Tdir); |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl; |  | ||||||
|  |  | ||||||
|   D.ContractJ5q(prop5,PJ5q); |  | ||||||
|   sliceSum(PJ5q,sumPJ5q,Tdir); |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl; |  | ||||||
|  |  | ||||||
|   Gamma::Algebra GammaV[3] = { |  | ||||||
|     Gamma::Algebra::GammaX, |  | ||||||
|     Gamma::Algebra::GammaY, |  | ||||||
|     Gamma::Algebra::GammaZ |  | ||||||
|   }; |  | ||||||
|   for( int mu=0;mu<3;mu++ ) { |  | ||||||
|     Gamma gV(GammaV[mu]); |  | ||||||
|     D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); |  | ||||||
|     //    auto ss=sliceSum(Vector_mu,Tdir); |  | ||||||
|     //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl; |  | ||||||
|     VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current |  | ||||||
|     sliceSum(VV,sumVV,Tdir); |  | ||||||
|     for(int t=0;t<Nt;t++){ |  | ||||||
|       RealD Ct = real(TensorRemove(sumVV[t]))*LCscale; |  | ||||||
|       RealD t3scale = 2 * M_PI *M_PI * t*t*t; // 2 pi^2 t^3, zero at t=0 |  | ||||||
|       RealD Cont=0;                           // reference value 1/(2 pi^2 t^3), left at 0 for t=0 |  | ||||||
|       if(t) Cont=1.0/t3scale; |  | ||||||
|       std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct |  | ||||||
|                << " 2 pi^2 t^3 C(t) "<< Ct*t3scale << " delta Ct "<< Ct-Cont <<std::endl; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Serializable record of the meson correlators: MesonTrace appends one vector (indexed by timeslice) per channel to `data`. |  | ||||||
| class MesonFile: Serializable { |  | ||||||
| public: |  | ||||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data); |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| // Contract two propagators in four channels (GammaX/Y/Z times Gamma5, and Identity, at source and sink), |  | ||||||
| // print every timeslice scaled by LLscale next to the reference value 1/(2 pi^2 t^3), and write the |  | ||||||
| // full table to an XML file. |  | ||||||
| //   file  : name of the XML file to write |  | ||||||
| //   q1,q2 : quark propagators; q1 enters as adj(q1) |  | ||||||
| //   phase : kept in the signature for callers; it is not used in this routine |  | ||||||
| void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) |  | ||||||
| { |  | ||||||
|   // One row per channel: {source gamma, sink gamma} |  | ||||||
|   const int nGamma=4; |  | ||||||
|   Gamma::Algebra channelTable[nGamma][2] = { |  | ||||||
|     {Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5}, |  | ||||||
|     {Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5}, |  | ||||||
|     {Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5}, |  | ||||||
|     {Gamma::Algebra::Identity,Gamma::Algebra::Identity} |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|   MesonFile output; |  | ||||||
|   LatticeComplex correlator(q1.Grid()); |  | ||||||
|  |  | ||||||
|   for(int ich=0;ich<nGamma;ich++){ |  | ||||||
|  |  | ||||||
|     Gamma srcGamma (channelTable[ich][0]); |  | ||||||
|     Gamma sinkGamma(channelTable[ich][1]); |  | ||||||
|  |  | ||||||
|     correlator = trace(adj(q1)*sinkGamma*q2*adj(srcGamma)); |  | ||||||
|  |  | ||||||
|     // Reduce the correlator field to one value per timeslice |  | ||||||
|     std::vector<TComplex> sliced; |  | ||||||
|     sliceSum(correlator,sliced, Tdir); |  | ||||||
|  |  | ||||||
|     const int nSlices=sliced.size(); |  | ||||||
|     std::vector<Complex> channelCorr(nSlices); |  | ||||||
|  |  | ||||||
|     for(int ts=0;ts<nSlices;ts++){ |  | ||||||
|       channelCorr[ts] = TensorRemove(sliced[ts])*LLscale; // Yes this is ugly, not figured a work around |  | ||||||
|       const RealD Ct   = real(channelCorr[ts]); |  | ||||||
|       const RealD Cont = ts ? 1.0/(2 * M_PI *M_PI * ts*ts*ts) : 0.0; // reference value, 0 at t=0 |  | ||||||
|       std::cout << " channel "<<ich<<" t "<<ts<<" " <<Ct<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * ts*ts*ts * Ct |  | ||||||
|                 << " deltaC " <<Ct-Cont<<std::endl; |  | ||||||
|     } |  | ||||||
|     output.data.push_back(channelCorr); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // The writer is confined to its own scope so it is destroyed right after the write |  | ||||||
|   { |  | ||||||
|     XmlWriter xml(file); |  | ||||||
|     write(xml,"MesonFile",output); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Driver: reads M5, mass and tadpole from the environment, builds the 4d/5d grids (Ls=10), takes the |  | ||||||
| // gauge field from the NERSC file named in argv[1] (when present and not starting with '-') or from a |  | ||||||
| // cold configuration, solves a point-source domain wall propagator per mass and writes the meson |  | ||||||
| // correlators. Exits with an error message when a required environment variable is missing. |  | ||||||
| int main (int argc, char ** argv) |  | ||||||
| { |  | ||||||
|   const int Ls=10; |  | ||||||
|  |  | ||||||
|   // Read a mandatory environment variable as a floating point number. |  | ||||||
|   // getenv returns a null pointer for an unset variable and atof must not be handed one, so check first. |  | ||||||
|   auto requiredEnv = [](const char *name) -> RealD { |  | ||||||
|     const char *text = getenv(name); |  | ||||||
|     if ( text == nullptr ) { |  | ||||||
|       std::cerr << "Environment variable '" << name << "' must be set" << std::endl; |  | ||||||
|       exit(EXIT_FAILURE); |  | ||||||
|     } |  | ||||||
|     return atof(text); |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|   // Validate the run parameters before any Grid start-up work is done |  | ||||||
|   RealD M5      = requiredEnv("M5"); |  | ||||||
|   RealD mq      = requiredEnv("mass"); |  | ||||||
|   int   tadpole = static_cast<int>(requiredEnv("tadpole")); |  | ||||||
|  |  | ||||||
|   Grid_init(&argc,&argv); |  | ||||||
|  |  | ||||||
|   // Double precision grids |  | ||||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), |  | ||||||
|                                                                    GridDefaultSimd(Nd,vComplex::Nsimd()), |  | ||||||
|                                                                    GridDefaultMpi()); |  | ||||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); |  | ||||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); |  | ||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////// |  | ||||||
|   // You can manage seeds however you like. |  | ||||||
|   // Recommend SeedUniqueString. |  | ||||||
|   ////////////////////////////////////////////////////////////////////// |  | ||||||
|   //  std::vector<int> seeds4({1,2,3,4}); |  | ||||||
|   //  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); |  | ||||||
|  |  | ||||||
|   LatticeGaugeField Umu(UGrid); |  | ||||||
|   std::string config; |  | ||||||
|   std::vector<RealD> masses({ mq} ); // u/d, s, c ?? |  | ||||||
|   if( argc > 1 && argv[1][0] != '-' ) |  | ||||||
|   { |  | ||||||
|     std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl; |  | ||||||
|     FieldMetaData header; |  | ||||||
|     NerscIO::readConfiguration(Umu, header, argv[1]); |  | ||||||
|     config=argv[1]; |  | ||||||
|     LLscale = 1.0; |  | ||||||
|     LCscale = 1.0; |  | ||||||
|   } |  | ||||||
|   else |  | ||||||
|   { |  | ||||||
|     SU<Nc>::ColdConfiguration(Umu); |  | ||||||
|     config="ColdConfig"; |  | ||||||
|     //    RealD P=1.0; // Don't scale |  | ||||||
|     //    RealD P=0.6388238 // 32Ifine |  | ||||||
|     //    RealD P=0.6153342; // 64I |  | ||||||
|     RealD P=0.5871119; // 48I |  | ||||||
|     RealD u0 = sqrt(sqrt(P)); // fourth root of the plaquette value P |  | ||||||
|     std::cout<<GridLogMessage <<"For plaquette P="<<P<<" u0= "<<u0<<std::endl; |  | ||||||
|     if ( tadpole == 1 ) { |  | ||||||
|       // Rescale the links by u0, leave M5 and the correlator scales untouched |  | ||||||
|       Umu = Umu * u0; |  | ||||||
|       //      with w0 = 1 - M5: |  | ||||||
|       //      LLscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0; |  | ||||||
|       //      LCscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0; |  | ||||||
|       LLscale = 1.0; |  | ||||||
|       LCscale = 1.0; |  | ||||||
|       std::cout<<GridLogMessage <<"Gauge links are u= u0 "<<std::endl; |  | ||||||
|       std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl; |  | ||||||
|     } else if ( tadpole == 2) { |  | ||||||
|       // Unit links, nothing rescaled |  | ||||||
|       std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl; |  | ||||||
|       LLscale = 1.0; |  | ||||||
|       LCscale = 1.0; |  | ||||||
|       std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl; |  | ||||||
|     } else { |  | ||||||
|       // Unit links, M5 shifted by the mean-field term and correlators rescaled by 1/u0^2 |  | ||||||
|       LLscale = 1.0/u0/u0; |  | ||||||
|       LCscale = 1.0/u0/u0; |  | ||||||
|       M5 = M5 - 4.0 * (1-u0); |  | ||||||
|       std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl; |  | ||||||
|       std::cout<<GridLogMessage <<"M5mf =  "<<M5<<std::endl; |  | ||||||
|     } |  | ||||||
|     std::cout<<GridLogMessage <<"mq =  "<<mq<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"LLscale =  "<<LLscale<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"LCscale =  "<<LCscale<<std::endl; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   int nmass = masses.size(); |  | ||||||
|  |  | ||||||
|   typedef DomainWallFermionD FermionActionD; |  | ||||||
|   //  typedef MobiusFermionD FermionActionD; |  | ||||||
|   // Owned by this function; released before Grid_finalize below |  | ||||||
|   std::vector<FermionActionD *> FermActs; |  | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage <<"======================"<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage <<"======================"<<std::endl; |  | ||||||
|  |  | ||||||
|   for(auto mass: masses) { |  | ||||||
|     std::vector<Complex> boundary = {1,1,1,-1}; |  | ||||||
|     FermionActionD::ImplParams Params(boundary); |  | ||||||
|     std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl; |  | ||||||
|     FermActs.push_back(new FermionActionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params)); |  | ||||||
|     std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   LatticePropagator point_source(UGrid); |  | ||||||
|  |  | ||||||
|   Coordinate Origin({0,0,0,0}); |  | ||||||
|   PointSource   (Origin,point_source); |  | ||||||
|  |  | ||||||
|   std::vector<LatticePropagator> PointProps(nmass,UGrid); |  | ||||||
|   //  std::vector<LatticePropagator> FreeProps(nmass,UGrid); |  | ||||||
|   //  LatticePropagator delta(UGrid); |  | ||||||
|  |  | ||||||
|   for(int m=0;m<nmass;m++) { |  | ||||||
|     Solve(*FermActs[m],point_source   ,PointProps[m]); |  | ||||||
|     //    MasslessFreePropagator(*FermActs[m],point_source   ,FreeProps[m]); |  | ||||||
|  |  | ||||||
|     //    delta = PointProps[m] - FreeProps[m]; |  | ||||||
|     //    std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   LatticeComplex phase(UGrid); |  | ||||||
|   Coordinate mom({0,0,0,0}); |  | ||||||
|   MakePhase(mom,phase); |  | ||||||
|  |  | ||||||
|   // One output file per (m1 <= m2) mass pair |  | ||||||
|   for(int m1=0 ;m1<nmass;m1++) { |  | ||||||
|   for(int m2=m1;m2<nmass;m2++) { |  | ||||||
|     std::stringstream ssp; |  | ||||||
|  |  | ||||||
|     ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml"; |  | ||||||
|  |  | ||||||
|     std::cout << "CG determined VV correlation function"<<std::endl; |  | ||||||
|     MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase); |  | ||||||
|  |  | ||||||
|     //    std::cout << "FFT derived VV correlation function"<<std::endl; |  | ||||||
|     //    MesonTrace(<config>_m<m1>_m<m2>_free_meson.xml,FreeProps[m1],FreeProps[m2],phase); |  | ||||||
|   }} |  | ||||||
|  |  | ||||||
|   // Release the fermion actions allocated with new above |  | ||||||
|   for(auto *action: FermActs) delete action; |  | ||||||
|   FermActs.clear(); |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |  | ||||||
|   return 0; |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1,433 +0,0 @@ | |||||||
| /* |  | ||||||
|  * Warning: this code is illustrative only: it is not well tested, and is not meant for production use |  | ||||||
|  * without regression tests being applied |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
|  |  | ||||||
| using namespace std; |  | ||||||
| using namespace Grid; |  | ||||||
|  |  | ||||||
| // Global scale factors, both initialised to 1.0. |  | ||||||
| // NOTE(review): presumably normalisations applied to the correlators computed later in this file - confirm against their uses. |  | ||||||
| RealD LLscale =1.0; |  | ||||||
| RealD LCscale =1.0; |  | ||||||
|  |  | ||||||
| // Gauge-covariant Laplacian over directions 0..Nd-2, assembled from covariant forward/backward shifts. |  | ||||||
| // Only M and Mdag are implemented; Mdiag, Mdir and MdirAll assert when called. |  | ||||||
| template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field> |  | ||||||
| { |  | ||||||
| public: |  | ||||||
|   INHERIT_GIMPL_TYPES(Gimpl); |  | ||||||
|  |  | ||||||
|   GridBase *grid; |  | ||||||
|   GaugeField U; |  | ||||||
|  |  | ||||||
|   // Stores a copy of the gauge field together with the grid it is defined on |  | ||||||
|   CovariantLaplacianCshift(GaugeField &gauge) |  | ||||||
|     : grid(gauge.Grid()), U(gauge) |  | ||||||
|   { |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   virtual GridBase *Grid(void) |  | ||||||
|   { |  | ||||||
|     return grid; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // out = sum over dir < Nd-1 of ( 2*in - forward hop - backward hop ) |  | ||||||
|   virtual void M(const Field &in, Field &out) |  | ||||||
|   { |  | ||||||
|     out=Zero(); |  | ||||||
|     for(int dir=0;dir<Nd-1;dir++) { |  | ||||||
|       GaugeLinkField link = PeekIndex<LorentzIndex>(U, dir); // NB: inefficient, the link is re-extracted on every call |  | ||||||
|       out = out - Gimpl::CovShiftForward(link,dir,in); |  | ||||||
|       out = out - Gimpl::CovShiftBackward(link,dir,in); |  | ||||||
|       out = out + 2.0*in; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Laplacian is hermitian, so Mdag forwards to M |  | ||||||
|   virtual void Mdag(const Field &in, Field &out) |  | ||||||
|   { |  | ||||||
|     M(in,out); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // The remaining operators are unimplemented; they are needed only for multigrid |  | ||||||
|   virtual void Mdiag(const Field &in, Field &out) |  | ||||||
|   { |  | ||||||
|     assert(0); |  | ||||||
|   } |  | ||||||
|   virtual void Mdir(const Field &in, Field &out,int dir, int disp) |  | ||||||
|   { |  | ||||||
|     assert(0); |  | ||||||
|   } |  | ||||||
|   virtual void MdirAll(const Field &in, std::vector<Field> &out) |  | ||||||
|   { |  | ||||||
|     assert(0); |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| // Fill `phase` with exp( i * sum_mu 2*pi*mom[mu]*x[mu]/L[mu] ), where L are the global lattice dimensions. |  | ||||||
| void MakePhase(Coordinate mom,LatticeComplex &phase) |  | ||||||
| { |  | ||||||
|   const auto &dims = phase.Grid()->GlobalDimensions(); |  | ||||||
|   const ComplexD imag_unit(0.0,1.0); |  | ||||||
|  |  | ||||||
|   LatticeComplex xmu(phase.Grid()); // site coordinate along the current direction |  | ||||||
|  |  | ||||||
|   // Accumulate the real exponent first, exponentiate once at the end |  | ||||||
|   phase=Zero(); |  | ||||||
|   for(int dir=0;dir<Nd;dir++){ |  | ||||||
|     const RealD two_pi_over_L =  M_PI * 2.0/ dims[dir]; |  | ||||||
|     LatticeCoordinate(xmu,dir); |  | ||||||
|     phase = phase + (two_pi_over_L * mom[dir]) * xmu; |  | ||||||
|   } |  | ||||||
|   phase = exp(phase*imag_unit); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // Point source: zero everywhere except a unit spin-colour matrix poked in at site `coor`. |  | ||||||
| void PointSource(Coordinate &coor,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   SpinColourMatrix unit; |  | ||||||
|   unit=1.0; |  | ||||||
|  |  | ||||||
|   source=Zero(); |  | ||||||
|   pokeSite(unit,source,coor); |  | ||||||
| } |  | ||||||
| // Noise wall source: bernoulli noise rescaled by (2*noise - (1,1))/sqrt(2), kept on timeslice `tslice` |  | ||||||
| // only, and multiplied into a unit propagator. Prints norm2 of the result. |  | ||||||
| void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   GridBase *g = source.Grid(); |  | ||||||
|  |  | ||||||
|   // Draw the noise and rescale it |  | ||||||
|   LatticeComplex z2(g); |  | ||||||
|   bernoulli(RNG, z2); // 0,1 50:50 |  | ||||||
|   const RealD invSqrt2=1.0/sqrt(2); |  | ||||||
|   z2 = (2.*z2 - Complex(1,1))*invSqrt2; |  | ||||||
|  |  | ||||||
|   // Zero the noise away from the requested timeslice |  | ||||||
|   LatticeInteger tcoor(g); |  | ||||||
|   LatticeCoordinate(tcoor,Tdir); |  | ||||||
|   LatticeComplex nothing(g); |  | ||||||
|   nothing=Zero(); |  | ||||||
|   z2 = where(tcoor==Integer(tslice), z2, nothing); |  | ||||||
|  |  | ||||||
|   source = 1.0; |  | ||||||
|   source = source*z2; |  | ||||||
|   std::cout << " Z2 wall " << norm2(source) << std::endl; |  | ||||||
| } |  | ||||||
| // Smear a field by applying (1 - coeff*Laplacian) repeatedly: 40 iterations, width 2.0, |  | ||||||
| // coeff = width^2/(4*iterations). Prints norm2 of the field after every iteration. |  | ||||||
| template<class Field> |  | ||||||
| void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) |  | ||||||
| { |  | ||||||
|   typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t; |  | ||||||
|  |  | ||||||
|   const Integer nIter = 40; |  | ||||||
|   const Real    sigma = 2.0; |  | ||||||
|   const Real    step  = (sigma*sigma) / Real(4*nIter); |  | ||||||
|  |  | ||||||
|   Laplacian_t laplace(U); |  | ||||||
|   Field lapSmeared(U.Grid()); |  | ||||||
|  |  | ||||||
|   smeared=unsmeared; |  | ||||||
|   //  chi = (1-p^2/2N)^N kronecker |  | ||||||
|   int iter = 0; |  | ||||||
|   while( iter < nIter ) { |  | ||||||
|     laplace.M(smeared,lapSmeared); |  | ||||||
|     smeared = smeared - step*lapSmeared; |  | ||||||
|     std::cout << " smear iter " << iter<<" " <<norm2(smeared)<<std::endl; |  | ||||||
|     ++iter; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   LatticePropagator tmp(source.Grid()); |  | ||||||
|   PointSource(site,source); |  | ||||||
|   std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl; |  | ||||||
|   tmp = source; |  | ||||||
|   GaussianSmear(U,tmp,source); |  | ||||||
|   std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl; |  | ||||||
| } |  | ||||||
| void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   Z2WallSource(RNG,tslice,source); |  | ||||||
|   auto tmp = source; |  | ||||||
|   GaussianSmear(U,tmp,source); |  | ||||||
| } |  | ||||||
| void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source) |  | ||||||
| { |  | ||||||
|   assert(mom.size()==Nd); |  | ||||||
|   assert(mom[Tdir] == 0); |  | ||||||
|  |  | ||||||
|   GridBase * grid = spectator.Grid(); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   LatticeInteger ts(grid); |  | ||||||
|   LatticeCoordinate(ts,Tdir); |  | ||||||
|   source = Zero(); |  | ||||||
|   source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else |  | ||||||
|  |  | ||||||
|   LatticeComplex phase(grid); |  | ||||||
|   MakePhase(mom,phase); |  | ||||||
|  |  | ||||||
|   source = source *phase; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class Action> |  | ||||||
| void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator) |  | ||||||
| {			    |  | ||||||
|  GridBase *UGrid = source.Grid(); |  | ||||||
|   GridBase *FGrid = D.FermionGrid(); |  | ||||||
|   bool fiveD = true; //calculate 4d free propagator                                                                                                                  |  | ||||||
|   RealD mass = D.Mass(); |  | ||||||
|   LatticeFermion src4  (UGrid); |  | ||||||
|   LatticeFermion result4  (UGrid); |  | ||||||
|   LatticeFermion result5(FGrid); |  | ||||||
|   LatticeFermion src5(FGrid); |  | ||||||
|   LatticePropagator prop5(FGrid); |  | ||||||
|   for(int s=0;s<Nd;s++){ |  | ||||||
|     for(int c=0;c<Nc;c++){ |  | ||||||
|   |  | ||||||
|       PropToFerm<Action>(src4,source,s,c); |  | ||||||
|  |  | ||||||
|       D.ImportPhysicalFermionSource(src4,src5); |  | ||||||
|       D.FreePropagator(src5,result5,mass,true); |  | ||||||
|       std::cout<<GridLogMessage |  | ||||||
|                <<"spin "<<s<<" color "<<c |  | ||||||
|                <<" norm2(src5d) "   <<norm2(src5) |  | ||||||
|                <<" norm2(result5d) "<<norm2(result5)<<std::endl; |  | ||||||
|  |  | ||||||
|       D.ExportPhysicalFermionSolution(result5,result4); |  | ||||||
|  |  | ||||||
|       FermToProp<Action>(prop5,result5,s,c); |  | ||||||
|       FermToProp<Action>(propagator,result4,s,c); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   LatticePropagator Vector_mu(UGrid); |  | ||||||
|   LatticeComplex    VV (UGrid); |  | ||||||
|   std::vector<TComplex> sumVV; |  | ||||||
|   Gamma::Algebra GammaV[3] = { |  | ||||||
|     Gamma::Algebra::GammaX, |  | ||||||
|     Gamma::Algebra::GammaY, |  | ||||||
|     Gamma::Algebra::GammaZ |  | ||||||
|   }; |  | ||||||
|   for( int mu=0;mu<3;mu++ ) { |  | ||||||
|     Gamma gV(GammaV[mu]); |  | ||||||
|     D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); |  | ||||||
|     VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current |  | ||||||
|     sliceSum(VV,sumVV,Tdir); |  | ||||||
|     int Nt = sumVV.size(); |  | ||||||
|     for(int t=0;t<Nt;t++){ |  | ||||||
|       RealD Ct = real(TensorRemove(sumVV[t]))*LCscale; |  | ||||||
|       std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct |  | ||||||
|                << " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class Action> |  | ||||||
| void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) |  | ||||||
| { |  | ||||||
|   GridBase *UGrid = D.GaugeGrid(); |  | ||||||
|   GridBase *FGrid = D.FermionGrid(); |  | ||||||
|  |  | ||||||
|   LatticeFermion src4  (UGrid);  |  | ||||||
|   LatticeFermion src5  (FGrid);  |  | ||||||
|   LatticeFermion result5(FGrid); |  | ||||||
|   LatticeFermion result4(UGrid); |  | ||||||
|   LatticePropagator prop5(FGrid); |  | ||||||
|    |  | ||||||
|   ConjugateGradient<LatticeFermion> CG(1.0e-6,100000); |  | ||||||
|   SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG); |  | ||||||
|   ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors |  | ||||||
|    for(int s=0;s<Nd;s++){ |  | ||||||
|     for(int c=0;c<Nc;c++){ |  | ||||||
|       PropToFerm<Action>(src4,source,s,c); |  | ||||||
|  |  | ||||||
|       D.ImportPhysicalFermionSource(src4,src5); |  | ||||||
|  |  | ||||||
|       result5=Zero(); |  | ||||||
|       schur(D,src5,result5,ZG); |  | ||||||
|       std::cout<<GridLogMessage |  | ||||||
| 	       <<"spin "<<s<<" color "<<c |  | ||||||
| 	       <<" norm2(src5d) "   <<norm2(src5) |  | ||||||
|                <<" norm2(result5d) "<<norm2(result5)<<std::endl; |  | ||||||
|  |  | ||||||
|       D.ExportPhysicalFermionSolution(result5,result4); |  | ||||||
|  |  | ||||||
|       FermToProp<Action>(prop5,result5,s,c); |  | ||||||
|       FermToProp<Action>(propagator,result4,s,c); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   LatticePropagator Axial_mu(UGrid);  |  | ||||||
|   LatticePropagator Vector_mu(UGrid);  |  | ||||||
|  |  | ||||||
|   LatticeComplex    PA (UGrid);  |  | ||||||
|   LatticeComplex    VV (UGrid);  |  | ||||||
|   LatticeComplex    PJ5q(UGrid); |  | ||||||
|   LatticeComplex    PP (UGrid); |  | ||||||
|  |  | ||||||
|   std::vector<TComplex> sumPA; |  | ||||||
|   std::vector<TComplex> sumVV; |  | ||||||
|   std::vector<TComplex> sumPP; |  | ||||||
|   std::vector<TComplex> sumPJ5q; |  | ||||||
|  |  | ||||||
|   Gamma g5(Gamma::Algebra::Gamma5); |  | ||||||
|   D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir); |  | ||||||
|   PA       = trace(g5*Axial_mu);      // Pseudoscalar-Axial conserved current |  | ||||||
|   sliceSum(PA,sumPA,Tdir); |  | ||||||
|  |  | ||||||
|   int Nt{static_cast<int>(sumPA.size())}; |  | ||||||
|  |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl; |  | ||||||
|  |  | ||||||
|   PP       = trace(adj(propagator)*propagator); // Pseudoscalar density |  | ||||||
|   sliceSum(PP,sumPP,Tdir); |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl; |  | ||||||
|    |  | ||||||
|   D.ContractJ5q(prop5,PJ5q); |  | ||||||
|   sliceSum(PJ5q,sumPJ5q,Tdir); |  | ||||||
|   for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl; |  | ||||||
|  |  | ||||||
|   Gamma::Algebra GammaV[3] = { |  | ||||||
|     Gamma::Algebra::GammaX, |  | ||||||
|     Gamma::Algebra::GammaY, |  | ||||||
|     Gamma::Algebra::GammaZ |  | ||||||
|   }; |  | ||||||
|   for( int mu=0;mu<3;mu++ ) { |  | ||||||
|     Gamma gV(GammaV[mu]); |  | ||||||
|     D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu); |  | ||||||
|     //    auto ss=sliceSum(Vector_mu,Tdir); |  | ||||||
|     //    for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl; |  | ||||||
|     VV       = trace(gV*Vector_mu);     // (local) Vector-Vector conserved current |  | ||||||
|     sliceSum(VV,sumVV,Tdir); |  | ||||||
|     for(int t=0;t<Nt;t++){ |  | ||||||
|       RealD Ct = real(TensorRemove(sumVV[t]))*LCscale; |  | ||||||
|       std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct |  | ||||||
| 	       << " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| class MesonFile: Serializable { |  | ||||||
| public: |  | ||||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data); |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) |  | ||||||
| { |  | ||||||
|   const int nchannel=3; |  | ||||||
|   Gamma::Algebra Gammas[nchannel][2] = { |  | ||||||
|     {Gamma::Algebra::GammaX,Gamma::Algebra::GammaX}, |  | ||||||
|     {Gamma::Algebra::GammaY,Gamma::Algebra::GammaY}, |  | ||||||
|     //    {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ} |  | ||||||
|     {Gamma::Algebra::Gamma5,Gamma::Algebra::Gamma5} |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|   Gamma G5(Gamma::Algebra::Gamma5); |  | ||||||
|  |  | ||||||
|   LatticeComplex meson_CF(q1.Grid()); |  | ||||||
|   MesonFile MF; |  | ||||||
|  |  | ||||||
|   for(int ch=0;ch<nchannel;ch++){ |  | ||||||
|  |  | ||||||
|     Gamma Gsrc(Gammas[ch][0]); |  | ||||||
|     Gamma Gsnk(Gammas[ch][1]); |  | ||||||
|  |  | ||||||
|     meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc)); |  | ||||||
|  |  | ||||||
|     std::vector<TComplex> meson_T; |  | ||||||
|     sliceSum(meson_CF,meson_T, Tdir); |  | ||||||
|  |  | ||||||
|     int nt=meson_T.size(); |  | ||||||
|  |  | ||||||
|     std::vector<Complex> corr(nt); |  | ||||||
|     for(int t=0;t<nt;t++){ |  | ||||||
|       corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around |  | ||||||
|       std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *real(corr[t])<<std::endl; |  | ||||||
|     } |  | ||||||
|     MF.data.push_back(corr); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   { |  | ||||||
|     XmlWriter WR(file); |  | ||||||
|     write(WR,"MesonFile",MF); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| int main (int argc, char ** argv) |  | ||||||
| { |  | ||||||
|   const int Ls=8; |  | ||||||
|  |  | ||||||
|   Grid_init(&argc,&argv); |  | ||||||
|  |  | ||||||
|   // Double precision grids |  | ||||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),  |  | ||||||
| 								   GridDefaultSimd(Nd,vComplex::Nsimd()), |  | ||||||
| 								   GridDefaultMpi()); |  | ||||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); |  | ||||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); |  | ||||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////// |  | ||||||
|   // You can manage seeds however you like. |  | ||||||
|   // Recommend SeedUniqueString. |  | ||||||
|   ////////////////////////////////////////////////////////////////////// |  | ||||||
|   //  std::vector<int> seeds4({1,2,3,4});  |  | ||||||
|   //  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4); |  | ||||||
|  |  | ||||||
|   LatticeGaugeField Umu(UGrid); |  | ||||||
|   std::string config; |  | ||||||
|   RealD M5=atof(getenv("M5")); |  | ||||||
|   RealD mq = atof(getenv("mass")); |  | ||||||
|   std::vector<RealD> masses({ mq} ); // u/d, s, c ?? |  | ||||||
|   if( argc > 1 && argv[1][0] != '-' ) |  | ||||||
|   { |  | ||||||
|     std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl; |  | ||||||
|     FieldMetaData header; |  | ||||||
|     NerscIO::readConfiguration(Umu, header, argv[1]); |  | ||||||
|     config=argv[1]; |  | ||||||
|     LLscale = 1.0; |  | ||||||
|     LCscale = 1.0; |  | ||||||
|   } |  | ||||||
|   else |  | ||||||
|   { |  | ||||||
|     SU<Nc>::ColdConfiguration(Umu); |  | ||||||
|     config="ColdConfig"; |  | ||||||
|     //    RealD P=1.0; // Don't scale |  | ||||||
|     //    RealD P=0.6153342; // 64I |  | ||||||
|     //    RealD P=0.6388238 // 32Ifine |  | ||||||
|     //    RealD P=0.5871119; // 48I |  | ||||||
|     //    RealD u0 = sqrt(sqrt(P)); |  | ||||||
|     //    Umu = Umu * u0; |  | ||||||
|     RealD w0 = 1 - M5; |  | ||||||
|     LLscale = 1.0/(1-w0*w0)/(1-w0*w0); |  | ||||||
|     LCscale = 1.0/(1-w0*w0)/(1-w0*w0); |  | ||||||
|     std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"M5 =  "<<M5<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"mq =  "<<mq<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"LLscale =  "<<LLscale<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage <<"LCscale =  "<<LCscale<<std::endl; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   int nmass = masses.size(); |  | ||||||
|  |  | ||||||
|   std::vector<DomainWallFermionD *> FermActs; |  | ||||||
|    |  | ||||||
|   std::cout<<GridLogMessage <<"======================"<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage <<"======================"<<std::endl; |  | ||||||
|  |  | ||||||
|   for(auto mass: masses) { |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl; |  | ||||||
|     FermActs.push_back(new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5)); |  | ||||||
|     std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl; |  | ||||||
|     |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   LatticePropagator point_source(UGrid); |  | ||||||
|  |  | ||||||
|   Coordinate Origin({0,0,0,0}); |  | ||||||
|   PointSource   (Origin,point_source); |  | ||||||
|    |  | ||||||
|   //  std::vector<LatticePropagator> PointProps(nmass,UGrid); |  | ||||||
|   std::vector<LatticePropagator> FreeProps(nmass,UGrid); |  | ||||||
|   LatticePropagator delta(UGrid); |  | ||||||
|  |  | ||||||
|   for(int m=0;m<nmass;m++) { |  | ||||||
|     //    Solve(*FermActs[m],point_source   ,PointProps[m]); |  | ||||||
|     MasslessFreePropagator(*FermActs[m],point_source   ,FreeProps[m]); |  | ||||||
|  |  | ||||||
|     //    delta = PointProps[m] - FreeProps[m]; |  | ||||||
|     //    std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   LatticeComplex phase(UGrid); |  | ||||||
|   Coordinate mom({0,0,0,0}); |  | ||||||
|   MakePhase(mom,phase); |  | ||||||
|    |  | ||||||
|   for(int m1=0 ;m1<nmass;m1++) { |  | ||||||
|   for(int m2=m1;m2<nmass;m2++) { |  | ||||||
|     std::stringstream ssp,ssg,ssz; |  | ||||||
|  |  | ||||||
|     ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml"; |  | ||||||
|     ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml"; |  | ||||||
|  |  | ||||||
|     //    std::cout << "CG determined VV correlation function"<<std::endl; |  | ||||||
|     //    MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase); |  | ||||||
|      |  | ||||||
|     std::cout << "FFT derived VV correlation function"<<std::endl; |  | ||||||
|     MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase); |  | ||||||
|   }} |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
							
								
								
									
										23
									
								
								systems/Aurora-AOT/config-command
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								systems/Aurora-AOT/config-command
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | |||||||
# Configure command for the Aurora-AOT system directory.
# Exports the compiler/linker flags and then runs Grid's configure
# script from two directories up.
# Expects MKLROOT and CLIME to be set in the environment.

#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "

#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel  -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel  -fsycl  -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel  -fsycl -fno-exceptions "

# CLIME is quoted so a path containing spaces stays a single argument.
../../configure \
	--enable-simd=GPU \
	--enable-gen-simd-width=64 \
	--enable-comms=mpi-auto \
	--enable-debug \
	--disable-gparity \
	--disable-fermion-reps \
	--with-lime="$CLIME" \
	--enable-shm=nvlink \
	--enable-accelerator=sycl \
	--enable-accelerator-aware-mpi=yes \
	--enable-unified=no \
	MPICXX=mpicxx \
	CXX=icpx

|  |  | ||||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user