mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-27 18:19:34 +00:00 
			
		
		
		
	Compare commits
	
		
			44 Commits
		
	
	
		
			feature/di
			...
			feature/fe
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 97a098636d | ||
|  | e13930c8b2 | ||
|  | 0655dab466 | ||
|  | 7f097bcc28 | ||
|  | 5c75aa5008 | ||
|  | 1873101362 | ||
|  | 63fd1dfa62 | ||
|  | bd68861b28 | ||
|  | 82e959f66c | ||
|  | 62e52de06d | ||
| 184adeedb8 | |||
| 5fa6a8b96d | |||
| a2a879b668 | |||
| 9317d893b2 | |||
| 86075fdd45 | |||
| b36442e263 | |||
| 513d797ea6 | |||
| 9e4835a3e3 | |||
|  | 477ebf24f4 | ||
|  | 0d5639f707 | ||
|  | 413312f9a9 | ||
|  | 03508448f8 | ||
|  | e1e5c75023 | ||
|  | 9296299b61 | ||
|  | 913fbca74a | ||
|  | 60dfb49afa | ||
|  | 554c238359 | ||
|  | f922adf05e | ||
|  | 188d2c7a4d | ||
|  | 17d7177105 | ||
|  | bb0a0da47a | ||
|  | 84110166e4 | ||
|  | d32b923b6c | ||
|  | 2ab1af5754 | ||
|  | 5f8892bf03 | ||
|  | f14e7e51e7 | ||
|  | 042ab1a052 | ||
|  | 2df98a99bc | ||
|  | 315ea18be2 | ||
|  | a9c2e1df03 | ||
| da4daea57a | |||
| e346154c5d | |||
| 3ca0de1c40 | |||
| c7205d2a73 | 
| @@ -16,6 +16,7 @@ | ||||
| #include <functional> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
| #include <strings.h> | ||||
| #include <stdio.h> | ||||
| #include <signal.h> | ||||
| #include <ctime> | ||||
|   | ||||
| @@ -262,7 +262,7 @@ public: | ||||
| 	autoView( Tnp_v , (*Tnp), AcceleratorWrite); | ||||
| 	autoView( Tnm_v , (*Tnm), AcceleratorWrite); | ||||
| 	const int Nsimd = CComplex::Nsimd(); | ||||
| 	accelerator_forNB(ss, FineGrid->oSites(), Nsimd, { | ||||
| 	accelerator_for(ss, FineGrid->oSites(), Nsimd, { | ||||
| 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); | ||||
| 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); | ||||
|         }); | ||||
|   | ||||
| @@ -264,7 +264,7 @@ public: | ||||
|       auto Tnp_v = Tnp->View(); | ||||
|       auto Tnm_v = Tnm->View(); | ||||
|       constexpr int Nsimd = vector_type::Nsimd(); | ||||
|       accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, { | ||||
|       accelerator_for(ss, in.Grid()->oSites(), Nsimd, { | ||||
| 	  coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss)); | ||||
| 	  coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss)); | ||||
|       }); | ||||
|   | ||||
| @@ -392,9 +392,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | ||||
|     acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes); | ||||
|   } | ||||
|  | ||||
|   if ( CommunicatorPolicy == CommunicatorPolicySequential ) { | ||||
|     this->StencilSendToRecvFromComplete(list,dir); | ||||
|   } | ||||
|   //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { | ||||
|   //    this->StencilSendToRecvFromComplete(list,dir); | ||||
|   //  } | ||||
|  | ||||
|   return off_node_bytes; | ||||
| } | ||||
|   | ||||
							
								
								
									
										24165
									
								
								Grid/json/json.hpp
									
									
									
									
									
								
							
							
						
						
									
										24165
									
								
								Grid/json/json.hpp
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de> | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) | ||||
| #include <Grid/lattice/Lattice_reduction_gpu.h> | ||||
| #endif | ||||
| #if defined(GRID_SYCL) | ||||
| #include <Grid/lattice/Lattice_reduction_sycl.h> | ||||
| #endif | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| @@ -127,7 +130,7 @@ inline Double max(const Double *arg, Integer osites) | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) | ||||
| { | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   return sum_gpu(arg,osites); | ||||
| #else | ||||
|   return sum_cpu(arg,osites); | ||||
| @@ -136,7 +139,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) | ||||
| { | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   return sumD_gpu(arg,osites); | ||||
| #else | ||||
|   return sumD_cpu(arg,osites); | ||||
| @@ -145,7 +148,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) | ||||
| { | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   return sumD_gpu_large(arg,osites); | ||||
| #else | ||||
|   return sumD_cpu(arg,osites); | ||||
| @@ -155,13 +158,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| { | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) | ||||
|   autoView( arg_v, arg, AcceleratorRead); | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
|   auto ssum= sum_gpu(&arg_v[0],osites); | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   typename vobj::scalar_object ssum; | ||||
|   autoView( arg_v, arg, AcceleratorRead); | ||||
|   ssum= sum_gpu(&arg_v[0],osites); | ||||
| #else | ||||
|   autoView(arg_v, arg, CpuRead); | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
|   auto ssum= sum_cpu(&arg_v[0],osites); | ||||
| #endif   | ||||
|   arg.Grid()->GlobalSum(ssum); | ||||
| @@ -171,7 +174,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) | ||||
| template<class vobj> | ||||
| inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg) | ||||
| { | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP) | ||||
| #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) | ||||
|   autoView( arg_v, arg, AcceleratorRead); | ||||
|   Integer osites = arg.Grid()->oSites(); | ||||
|   auto ssum= sum_gpu_large(&arg_v[0],osites); | ||||
| @@ -235,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | ||||
|   typedef decltype(innerProductD(vobj(),vobj())) inner_t; | ||||
|   Vector<inner_t> inner_tmp(sites); | ||||
|   auto inner_tmp_v = &inner_tmp[0]; | ||||
|      | ||||
|   { | ||||
|     autoView( left_v , left, AcceleratorRead); | ||||
|     autoView( right_v,right, AcceleratorRead); | ||||
|  | ||||
|     // This code could read coalesce | ||||
|     // GPU - SIMT lane compliance... | ||||
|     accelerator_for( ss, sites, 1,{ | ||||
| 	auto x_l = left_v[ss]; | ||||
|   | ||||
							
								
								
									
										125
									
								
								Grid/lattice/Lattice_reduction_sycl.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										125
									
								
								Grid/lattice/Lattice_reduction_sycl.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,125 @@ | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Possibly promote to double and sum | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)  | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   typedef typename vobj::scalar_objectD sobjD; | ||||
|   sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator); | ||||
|   sobj identity; zeroit(identity); | ||||
|   sobj ret ;  | ||||
|  | ||||
|   Integer nsimd= vobj::Nsimd(); | ||||
|    | ||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|      auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>()); | ||||
|      cgh.parallel_for(cl::sycl::range<1>{osites}, | ||||
| 		      Reduction, | ||||
| 		      [=] (cl::sycl::id<1> item, auto &sum) { | ||||
|       auto osite   = item[0]; | ||||
|       sum +=Reduce(lat[osite]); | ||||
|      }); | ||||
|    }); | ||||
|   theGridAccelerator->wait(); | ||||
|   ret = mysum[0]; | ||||
|   free(mysum,*theGridAccelerator); | ||||
|   sobjD dret; convertType(dret,ret); | ||||
|   return dret; | ||||
| } | ||||
|  | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites) | ||||
| { | ||||
|   return sumD_gpu_tensor(lat,osites); | ||||
| } | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) | ||||
| { | ||||
|   return sumD_gpu_large(lat,osites); | ||||
| } | ||||
|  | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) | ||||
| { | ||||
|   return sumD_gpu_large(lat,osites); | ||||
| } | ||||
|  | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| // Return as same precision as input performing reduction in double precision though | ||||
| ///////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)  | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   sobj result; | ||||
|   result = sumD_gpu(lat,osites); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites) | ||||
| { | ||||
|   typedef typename vobj::scalar_object sobj; | ||||
|   sobj result; | ||||
|   result = sumD_gpu_large(lat,osites); | ||||
|   return result; | ||||
| } | ||||
|  | ||||
| NAMESPACE_END(Grid); | ||||
|  | ||||
| /* | ||||
| template<class Double> Double svm_reduce(Double *vec,uint64_t L) | ||||
| { | ||||
|   Double sumResult; zeroit(sumResult); | ||||
|   Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator); | ||||
|   Double identity;  zeroit(identity); | ||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||
|      auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>()); | ||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, | ||||
| 		      Reduction, | ||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { | ||||
| 	 sum +=vec[index]; | ||||
|      }); | ||||
|    }); | ||||
|   theGridAccelerator->wait(); | ||||
|   Double ret = d_sum[0]; | ||||
|   free(d_sum,*theGridAccelerator); | ||||
|   std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl; | ||||
|   return ret; | ||||
| } | ||||
|  | ||||
| template <class vobj> | ||||
| inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) | ||||
| { | ||||
|   typedef typename vobj::vector_type  vector; | ||||
|   typedef typename vobj::scalar_type  scalar; | ||||
|  | ||||
|   typedef typename vobj::scalar_typeD scalarD; | ||||
|   typedef typename vobj::scalar_objectD sobjD; | ||||
|  | ||||
|   sobjD ret; | ||||
|   scalarD *ret_p = (scalarD *)&ret; | ||||
|    | ||||
|   const int nsimd = vobj::Nsimd(); | ||||
|   const int words = sizeof(vobj)/sizeof(vector); | ||||
|  | ||||
|   Vector<scalar> buffer(osites*nsimd); | ||||
|   scalar *buf = &buffer[0]; | ||||
|   vector *dat = (vector *)lat; | ||||
|  | ||||
|   for(int w=0;w<words;w++) { | ||||
|  | ||||
|     accelerator_for(ss,osites,nsimd,{ | ||||
| 	int lane = acceleratorSIMTlane(nsimd); | ||||
| 	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane); | ||||
|     }); | ||||
|     //Precision change at this point is to late to gain precision | ||||
|     ret_p[w] = svm_reduce(buf,nsimd*osites); | ||||
|   } | ||||
|   return ret; | ||||
| } | ||||
| */ | ||||
| @@ -451,9 +451,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz | ||||
| // Fermion <-> propagator assignements | ||||
| ////////////////////////////////////////////// | ||||
| //template <class Prop, class Ferm> | ||||
| #define FAST_FERM_TO_PROP | ||||
| template <class Fimpl> | ||||
| void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c) | ||||
| { | ||||
| #ifdef FAST_FERM_TO_PROP | ||||
|   autoView(p_v,p,AcceleratorWrite); | ||||
|   autoView(f_v,f,AcceleratorRead); | ||||
|   accelerator_for(idx,p_v.oSites(),1,{ | ||||
|       for(int ss = 0; ss < Ns; ++ss) { | ||||
|       for(int cc = 0; cc < Fimpl::Dimension; ++cc) { | ||||
| 	p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.) | ||||
|       }} | ||||
|     }); | ||||
| #else | ||||
|   for(int j = 0; j < Ns; ++j) | ||||
|     { | ||||
|       auto pjs = peekSpin(p, j, s); | ||||
| @@ -465,12 +476,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio | ||||
| 	} | ||||
|       pokeSpin(p, pjs, j, s); | ||||
|     } | ||||
| #endif | ||||
| } | ||||
|      | ||||
| //template <class Prop, class Ferm> | ||||
| template <class Fimpl> | ||||
| void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c) | ||||
| { | ||||
| #ifdef FAST_FERM_TO_PROP | ||||
|   autoView(p_v,p,AcceleratorWrite); | ||||
|   autoView(f_v,f,AcceleratorRead); | ||||
|   accelerator_for(idx,p_v.oSites(),1,{ | ||||
|       for(int ss = 0; ss < Ns; ++ss) { | ||||
|       for(int cc = 0; cc < Fimpl::Dimension; ++cc) { | ||||
| 	f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index | ||||
|       }} | ||||
|     }); | ||||
| #else | ||||
|   for(int j = 0; j < Ns; ++j) | ||||
|     { | ||||
|       auto pjs = peekSpin(p, j, s); | ||||
| @@ -482,6 +504,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato | ||||
| 	} | ||||
|       pokeSpin(f, fj, j); | ||||
|     } | ||||
| #endif | ||||
| } | ||||
|      | ||||
| ////////////////////////////////////////////// | ||||
|   | ||||
| @@ -204,15 +204,18 @@ public: | ||||
|   typedef WilsonCloverHelpers<Impl> Helpers; | ||||
|   typedef CompactWilsonCloverHelpers<Impl> CompactHelpers; | ||||
|  | ||||
|   static void MassTerm(CloverField& Clover, RealD diag_mass) { | ||||
|   static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { | ||||
|     Clover += diag_mass; | ||||
|   } | ||||
|  | ||||
|   static void Exponentiate_Clover(CloverDiagonalField& Diagonal, | ||||
|                           CloverTriangleField& Triangle, | ||||
|                           RealD csw_t, RealD diag_mass) { | ||||
|   static void InvertClover(CloverField& InvClover, | ||||
|                             const CloverDiagonalField& diagonal, | ||||
|                             const CloverTriangleField& triangle, | ||||
|                             CloverDiagonalField&       diagonalInv, | ||||
|                             CloverTriangleField&       triangleInv, | ||||
|                             bool fixedBoundaries) { | ||||
|  | ||||
|     // Do nothing | ||||
|     CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv); | ||||
|   } | ||||
|  | ||||
|   // TODO: implement Cmunu for better performances with compact layout, but don't do it | ||||
| @@ -237,9 +240,17 @@ public: | ||||
|   template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>; | ||||
|   typedef CompactWilsonCloverHelpers<Impl> CompactHelpers; | ||||
|  | ||||
|   static void MassTerm(CloverField& Clover, RealD diag_mass) { | ||||
|     // do nothing! | ||||
|     // mass term is multiplied to exp(Clover) below | ||||
|   // Can this be avoided? | ||||
|   static void IdentityTimesC(const CloverField& in, RealD c) { | ||||
|     int DimRep = Impl::Dimension; | ||||
|  | ||||
|     autoView(in_v, in, AcceleratorWrite); | ||||
|  | ||||
|     accelerator_for(ss, in.Grid()->oSites(), 1, { | ||||
|       for (int sa=0; sa<Ns; sa++) | ||||
|         for (int ca=0; ca<DimRep; ca++) | ||||
|           in_v[ss]()(sa,sa)(ca,ca) = c; | ||||
|     }); | ||||
|   } | ||||
|  | ||||
|   static int getNMAX(RealD prec, RealD R) { | ||||
| @@ -254,175 +265,62 @@ public: | ||||
|     return NMAX; | ||||
|   } | ||||
|  | ||||
|   static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);} | ||||
|   static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);} | ||||
|   static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);} | ||||
|   static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);} | ||||
|  | ||||
|   static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){ | ||||
|   static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { | ||||
|  | ||||
|   	  typedef iMatrix<ComplexD,6> mat; | ||||
|     GridBase* grid = Clover.Grid(); | ||||
|     CloverField ExpClover(grid); | ||||
|  | ||||
|   	  RealD qn[6]; | ||||
|   	  RealD qnold[6]; | ||||
|   	  RealD p[5]; | ||||
|   	  RealD trA2, trA3, trA4; | ||||
|     int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass); | ||||
|  | ||||
|   	  mat A2, A3, A4, A5; | ||||
|   	  A2 = alpha * alpha * arg * arg; | ||||
|   	  A3 = alpha * arg * A2; | ||||
|   	  A4 = A2 * A2; | ||||
|   	  A5 = A2 * A3; | ||||
|     Clover *= (1.0/diag_mass); | ||||
|  | ||||
|   	  trA2 = toReal( trace(A2) ); | ||||
|   	  trA3 = toReal( trace(A3) ); | ||||
|   	  trA4 = toReal( trace(A4)); | ||||
|  | ||||
|   	  p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0; | ||||
|   	  p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0; | ||||
|   	  p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2; | ||||
|   	  p[3] = trA3 / 3.0; | ||||
|   	  p[4] = 0.5 * trA2; | ||||
|  | ||||
|   	  qnold[0] = cN[Niter]; | ||||
|   	  qnold[1] = 0.0; | ||||
|   	  qnold[2] = 0.0; | ||||
|   	  qnold[3] = 0.0; | ||||
|   	  qnold[4] = 0.0; | ||||
|   	  qnold[5] = 0.0; | ||||
|  | ||||
|   	  for(int i = Niter-1; i >= 0; i--) | ||||
|   	  { | ||||
|   	   qn[0] = p[0] * qnold[5] + cN[i]; | ||||
|   	   qn[1] = p[1] * qnold[5] + qnold[0]; | ||||
|   	   qn[2] = p[2] * qnold[5] + qnold[1]; | ||||
|   	   qn[3] = p[3] * qnold[5] + qnold[2]; | ||||
|   	   qn[4] = p[4] * qnold[5] + qnold[3]; | ||||
|   	   qn[5] = qnold[4]; | ||||
|  | ||||
|   	   qnold[0] = qn[0]; | ||||
|   	   qnold[1] = qn[1]; | ||||
|   	   qnold[2] = qn[2]; | ||||
|   	   qnold[3] = qn[3]; | ||||
|   	   qnold[4] = qn[4]; | ||||
|   	   qnold[5] = qn[5]; | ||||
|   	  } | ||||
|  | ||||
|   	  mat unit(1.0); | ||||
|  | ||||
|   	  dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5); | ||||
|  | ||||
|     } | ||||
|  | ||||
|   static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) { | ||||
|  | ||||
|     GridBase* grid = Diagonal.Grid(); | ||||
|     int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass); | ||||
|  | ||||
|     // | ||||
|     // Implementation completely in Daniel's layout | ||||
|     // | ||||
|  | ||||
|     // Taylor expansion with Cayley-Hamilton recursion | ||||
|     // underlying Horner scheme as above | ||||
|     // Taylor expansion, slow but generic | ||||
|     // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...)) | ||||
|     // qN = cN | ||||
|     // qn = cn + qn+1 X | ||||
|     std::vector<RealD> cn(NMAX+1); | ||||
|     cn[0] = 1.0; | ||||
|     for (int i=1; i<=NMAX; i++){ | ||||
|     for (int i=1; i<=NMAX; i++) | ||||
|       cn[i] = cn[i-1] / RealD(i); | ||||
|     } | ||||
|  | ||||
|       // Taken over from Daniel's implementation | ||||
|       conformable(Diagonal, Triangle); | ||||
|     ExpClover = Zero(); | ||||
|     IdentityTimesC(ExpClover, cn[NMAX]); | ||||
|     for (int i=NMAX-1; i>=0; i--) | ||||
|       ExpClover = ExpClover * Clover + cn[i]; | ||||
|  | ||||
|       long lsites = grid->lSites(); | ||||
|     { | ||||
|       typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal; | ||||
|       typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle; | ||||
|       typedef iMatrix<ComplexD,6> mat; | ||||
|     // prepare inverse | ||||
|     CloverInv = (-1.0)*Clover; | ||||
|  | ||||
|       autoView(diagonal_v,  Diagonal,  CpuRead); | ||||
|       autoView(triangle_v,  Triangle,  CpuRead); | ||||
|       autoView(diagonalExp_v, Diagonal, CpuWrite); | ||||
|       autoView(triangleExp_v, Triangle, CpuWrite); | ||||
|     Clover = ExpClover * diag_mass; | ||||
|  | ||||
|       thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite | ||||
|     ExpClover = Zero(); | ||||
|     IdentityTimesC(ExpClover, cn[NMAX]); | ||||
|     for (int i=NMAX-1; i>=0; i--) | ||||
|       ExpClover = ExpClover * CloverInv + cn[i]; | ||||
|  | ||||
|     	  mat srcCloverOpUL(0.0); // upper left block | ||||
|     	  mat srcCloverOpLR(0.0); // lower right block | ||||
|     	  mat ExpCloverOp; | ||||
|     CloverInv = ExpClover * (1.0/diag_mass); | ||||
|  | ||||
|         scalar_object_diagonal diagonal_tmp     = Zero(); | ||||
|         scalar_object_diagonal diagonal_exp_tmp = Zero(); | ||||
|         scalar_object_triangle triangle_tmp     = Zero(); | ||||
|         scalar_object_triangle triangle_exp_tmp = Zero(); | ||||
|  | ||||
|         Coordinate lcoor; | ||||
|         grid->LocalIndexToLocalCoor(site, lcoor); | ||||
|  | ||||
|         peekLocalSite(diagonal_tmp, diagonal_v, lcoor); | ||||
|         peekLocalSite(triangle_tmp, triangle_v, lcoor); | ||||
|  | ||||
|         int block; | ||||
|         block = 0; | ||||
|         for(int i = 0; i < 6; i++){ | ||||
|         	for(int j = 0; j < 6; j++){ | ||||
|         		if (i == j){ | ||||
|         			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i))); | ||||
|         		} | ||||
|         		else{ | ||||
|         			srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j))); | ||||
|         		} | ||||
|         	} | ||||
|         } | ||||
|         block = 1; | ||||
|         for(int i = 0; i < 6; i++){ | ||||
|           	for(int j = 0; j < 6; j++){ | ||||
|            		if (i == j){ | ||||
|            			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i))); | ||||
|            		} | ||||
|            		else{ | ||||
|            			srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j))); | ||||
|            		} | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // exp(Clover) | ||||
|  | ||||
|         ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp); | ||||
|  | ||||
|         block = 0; | ||||
|         for(int i = 0; i < 6; i++){ | ||||
|         	for(int j = 0; j < 6; j++){ | ||||
|             	if (i == j){ | ||||
|             		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j); | ||||
|             	} | ||||
|             	else if(i < j){ | ||||
|             		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j); | ||||
|             	} | ||||
|            	} | ||||
|         } | ||||
|  | ||||
|         ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp); | ||||
|  | ||||
|         block = 1; | ||||
|         for(int i = 0; i < 6; i++){ | ||||
|         	for(int j = 0; j < 6; j++){ | ||||
|               	if (i == j){ | ||||
|               		diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j); | ||||
|                	} | ||||
|                	else if(i < j){ | ||||
|                		triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j); | ||||
|                	} | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor); | ||||
|         pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor); | ||||
|       }); | ||||
|     } | ||||
|  | ||||
|     Diagonal *= diag_mass; | ||||
|     Triangle *= diag_mass; | ||||
|   } | ||||
|  | ||||
|   static void InvertClover(CloverField& InvClover, | ||||
|                             const CloverDiagonalField& diagonal, | ||||
|                             const CloverTriangleField& triangle, | ||||
|                             CloverDiagonalField&       diagonalInv, | ||||
|                             CloverTriangleField&       triangleInv, | ||||
|                             bool fixedBoundaries) { | ||||
|  | ||||
|     if (fixedBoundaries) | ||||
|     { | ||||
|       CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv); | ||||
|     } | ||||
|     else | ||||
|     { | ||||
|       CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) { | ||||
|     assert(0); | ||||
|   | ||||
| @@ -225,7 +225,7 @@ public: | ||||
|   RealD csw_t; | ||||
|   RealD cF; | ||||
|  | ||||
|   bool open_boundaries; | ||||
|   bool fixedBoundaries; | ||||
|  | ||||
|   CloverDiagonalField Diagonal,    DiagonalEven,    DiagonalOdd; | ||||
|   CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd; | ||||
|   | ||||
| @@ -117,19 +117,19 @@ public: | ||||
|     typedef decltype(coalescedRead(*in))    sobj; | ||||
|     typedef decltype(coalescedRead(*out0)) hsobj; | ||||
|  | ||||
|     unsigned int Nsimd = vobj::Nsimd(); | ||||
|     constexpr unsigned int Nsimd = vobj::Nsimd(); | ||||
|     unsigned int mask = Nsimd >> (type + 1); | ||||
|     int lane = acceleratorSIMTlane(Nsimd); | ||||
|     int j0 = lane &(~mask); // inner coor zero | ||||
|     int j1 = lane |(mask) ; // inner coor one | ||||
|     const vobj *vp0 = &in[k]; | ||||
|     const vobj *vp1 = &in[m]; | ||||
|     const vobj *vp = (lane&mask) ? vp1:vp0; | ||||
|     auto sa = coalescedRead(*vp,j0); | ||||
|     auto sb = coalescedRead(*vp,j1); | ||||
|     const vobj *vp0 = &in[k];  // out0[j] = merge low bit of type from in[k] and in[m]  | ||||
|     const vobj *vp1 = &in[m];  // out1[j] = merge hi  bit of type from in[k] and in[m] | ||||
|     const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0 | ||||
|     auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing | ||||
|     auto sb = coalescedRead(*vp,j1); // lane to read for out 1 | ||||
|     hsobj psa, psb; | ||||
|     projector::Proj(psa,sa,mu,dag); | ||||
|     projector::Proj(psb,sb,mu,dag); | ||||
|     projector::Proj(psa,sa,mu,dag);  // spin project the result0 | ||||
|     projector::Proj(psb,sb,mu,dag);  // spin project the result1 | ||||
|     coalescedWrite(out0[j],psa); | ||||
|     coalescedWrite(out1[j],psb); | ||||
| #else | ||||
|   | ||||
| @@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug | ||||
|   , csw_r(_csw_r) | ||||
|   , csw_t(_csw_t) | ||||
|   , cF(_cF) | ||||
|   , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0) | ||||
|   , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0) | ||||
|   , Diagonal(&Fgrid),        Triangle(&Fgrid) | ||||
|   , DiagonalEven(&Hgrid),    TriangleEven(&Hgrid) | ||||
|   , DiagonalOdd(&Hgrid),     TriangleOdd(&Hgrid) | ||||
| @@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug | ||||
|     csw_r /= clover_anisotropy.xi_0; | ||||
|  | ||||
|   ImportGauge(_Umu); | ||||
|   if (open_boundaries) { | ||||
|   if (fixedBoundaries) { | ||||
|     this->BoundaryMaskEven.Checkerboard() = Even; | ||||
|     this->BoundaryMaskOdd.Checkerboard() = Odd; | ||||
|     CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd); | ||||
| @@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug | ||||
| template<class Impl, class CloverHelpers> | ||||
| void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) { | ||||
|   WilsonBase::Dhop(in, out, dag); | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) { | ||||
|   WilsonBase::DhopOE(in, out, dag); | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) { | ||||
|   WilsonBase::DhopEO(in, out, dag); | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) { | ||||
|   WilsonBase::DhopDir(in, out, dir, disp); | ||||
|   if(this->open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(this->fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) { | ||||
|   WilsonBase::DhopDirAll(in, out); | ||||
|   if(this->open_boundaries) { | ||||
|   if(this->fixedBoundaries) { | ||||
|     for(auto& o : out) ApplyBoundaryMask(o); | ||||
|   } | ||||
| } | ||||
| @@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in, | ||||
|   WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc | ||||
|   Mooee(in, Tmp); | ||||
|   axpy(out, 1.0, out, Tmp); | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| @@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i | ||||
|   WilsonBase::Dhop(in, out, DaggerYes);  // call base to save applying bc | ||||
|   MooeeDag(in, Tmp); | ||||
|   axpy(out, 1.0, out, Tmp); | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) { | ||||
|   WilsonBase::Meooe(in, out); | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) { | ||||
|   WilsonBase::MeooeDag(in, out); | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| @@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField& | ||||
|   } else { | ||||
|     MooeeInternal(in, out, Diagonal, Triangle); | ||||
|   } | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| @@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel | ||||
|   } else { | ||||
|     MooeeInternal(in, out, DiagonalInv, TriangleInv); | ||||
|   } | ||||
|   if(open_boundaries) ApplyBoundaryMask(out); | ||||
|   if(fixedBoundaries) ApplyBoundaryMask(out); | ||||
| } | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| @@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField | ||||
|  | ||||
| template<class Impl, class CloverHelpers> | ||||
| void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) { | ||||
|   assert(!open_boundaries); // TODO check for changes required for open bc | ||||
|   assert(!fixedBoundaries); // TODO check for changes required for open bc | ||||
|  | ||||
|   // NOTE: code copied from original clover term | ||||
|   conformable(X.Grid(), Y.Grid()); | ||||
| @@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie | ||||
|   GridBase* grid = _Umu.Grid(); | ||||
|   typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); | ||||
|   CloverField TmpOriginal(grid); | ||||
|   CloverField TmpInverse(grid); | ||||
|  | ||||
|   // Compute the field strength terms mu>nu | ||||
|   double t2 = usecond(); | ||||
| @@ -324,24 +325,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie | ||||
|   TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t; | ||||
|   TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t; | ||||
|   TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; | ||||
|   // Handle mass term based on clover policy | ||||
|   CloverHelpers::MassTerm(TmpOriginal, this->diag_mass); | ||||
|    | ||||
|   // Convert the data layout of the clover term | ||||
|  | ||||
|   // Instantiate the clover term | ||||
|   // - In case of the standard clover the mass term is added | ||||
|   // - In case of the exponential clover the clover term is exponentiated | ||||
|   double t4 = usecond(); | ||||
|   CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass); | ||||
|  | ||||
|   // Convert the data layout of the clover term | ||||
|   double t5 = usecond(); | ||||
|   CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); | ||||
|  | ||||
|   // Exponentiate the clover (nothing happens in case of the standard clover) | ||||
|   double t5 = usecond(); | ||||
|   CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass); | ||||
|  | ||||
|   // Possible modify the boundary values | ||||
|   // Modify the clover term at the temporal boundaries in case of open boundary conditions | ||||
|   double t6 = usecond(); | ||||
|   if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); | ||||
|   if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); | ||||
|  | ||||
|   // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions) | ||||
|   // Invert the Clover term | ||||
|   // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved | ||||
|   // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted. | ||||
|   // TODO: For now this inversion is explictly done on the CPU | ||||
|   double t7 = usecond(); | ||||
|   CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); | ||||
|   CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries); | ||||
|  | ||||
|   // Fill the remaining clover fields | ||||
|   double t8 = usecond(); | ||||
| @@ -362,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie | ||||
|   std::cout << GridLogDebug << "allocations =                " << (t2 - t1) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "field strength =             " << (t3 - t2) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "fill clover =                " << (t4 - t3) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "convert =                    " << (t5 - t4) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "exponentiation =             " << (t6 - t5) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "boundaries =                 " << (t7 - t6) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "inversions =                 " << (t8 - t7) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "instantiate clover =         " << (t5 - t4) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "convert layout =             " << (t6 - t5) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "modify boundaries =          " << (t7 - t6) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "invert clover =              " << (t8 - t7) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "pick cbs =                   " << (t9 - t8) / 1e6 << std::endl; | ||||
|   std::cout << GridLogDebug << "total =                      " << (t9 - t0) / 1e6 << std::endl; | ||||
| } | ||||
|   | ||||
| @@ -498,6 +498,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDag);     return;} | ||||
| #endif | ||||
|      acceleratorFenceComputeStream(); | ||||
|    } else if( interior ) { | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;} | ||||
| @@ -505,11 +506,13 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagInt);     return;} | ||||
| #endif | ||||
|    } else if( exterior ) { | ||||
|      acceleratorFenceComputeStream(); | ||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} | ||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;} | ||||
| #ifndef GRID_CUDA | ||||
|      if (Opt == WilsonKernelsStatic::OptInlineAsm  ) {  ASM_CALL(AsmDhopSiteDagExt);     return;} | ||||
| #endif | ||||
|      acceleratorFenceComputeStream(); | ||||
|    } | ||||
|    assert(0 && " Kernel optimisation case not covered "); | ||||
|   } | ||||
|   | ||||
| @@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid); | ||||
|      | ||||
|     typedef Lattice<SiteLink>  LinkField; | ||||
|     typedef Lattice<SiteField> Field; | ||||
|     typedef Field              ComplexField; | ||||
|     typedef LinkField          ComplexField; | ||||
|   }; | ||||
|    | ||||
|   typedef QedGImpl<vComplex> QedGImplR; | ||||
|   | ||||
| @@ -26,7 +26,7 @@ | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) | ||||
| #ifndef GRID_HIP | ||||
|  | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| @@ -82,7 +82,7 @@ void JSONWriter::writeDefault(const std::string &s,	const std::string &x) | ||||
|   if (s.size()) | ||||
|     ss_ << "\""<< s << "\" : \"" << os.str() << "\" ," ; | ||||
|   else | ||||
|     ss_ << os.str() << " ," ; | ||||
|     ss_ << "\""<< os.str() << "\" ," ; | ||||
| } | ||||
|  | ||||
| // Reader implementation /////////////////////////////////////////////////////// | ||||
|   | ||||
| @@ -54,7 +54,7 @@ namespace Grid | ||||
|     void pop(void); | ||||
|     template <typename U> | ||||
|     void writeDefault(const std::string &s, const U &x); | ||||
| #ifdef __NVCC__ | ||||
| #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|     void writeDefault(const std::string &s, const Grid::ComplexD &x)  | ||||
|     {  | ||||
|       std::complex<double> z(real(x),imag(x)); | ||||
| @@ -101,7 +101,7 @@ namespace Grid | ||||
|     void readDefault(const std::string &s, std::vector<U> &output); | ||||
|     template <typename U, typename P> | ||||
|     void readDefault(const std::string &s, std::pair<U,P> &output); | ||||
| #ifdef __NVCC__ | ||||
| #if defined(GRID_CUDA) || defined(GRID_HIP) | ||||
|     void readDefault(const std::string &s, ComplexD &output) | ||||
|     {  | ||||
|       std::complex<double> z; | ||||
|   | ||||
| @@ -36,7 +36,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
| #include "BinaryIO.h" | ||||
| #include "TextIO.h" | ||||
| #include "XmlIO.h" | ||||
| #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) | ||||
| #ifndef GRID_HIP | ||||
| #include "JSON_IO.h" | ||||
| #endif | ||||
|  | ||||
|   | ||||
| @@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat | ||||
| /////////////////////////////////////////////////////////////////// | ||||
| template<class cobj,class vobj,class compressor> | ||||
| void Gather_plane_exchange_table(const Lattice<vobj> &rhs, | ||||
| 				 commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline)); | ||||
| 				 commVector<cobj *> pointers, | ||||
| 				 int dimension,int plane, | ||||
| 				 int cbmask,compressor &compress,int type) __attribute__((noinline)); | ||||
|  | ||||
| template<class cobj,class vobj,class compressor> | ||||
| void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs, | ||||
| 				 Vector<cobj *> pointers,int dimension,int plane,int cbmask, | ||||
| void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table, | ||||
| 				 const Lattice<vobj> &rhs, | ||||
| 				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask, | ||||
| 				 compressor &compress,int type) | ||||
| { | ||||
|   assert( (table.size()&0x1)==0); | ||||
| @@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La | ||||
|   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane | ||||
|  | ||||
|   auto rhs_v = rhs.View(AcceleratorRead); | ||||
|   auto rhs_p = &rhs_v[0]; | ||||
|   auto p0=&pointers[0][0]; | ||||
|   auto p1=&pointers[1][0]; | ||||
|   auto tp=&table[0]; | ||||
|   accelerator_forNB(j, num, vobj::Nsimd(), { | ||||
|       compress.CompressExchange(p0,p1, &rhs_v[0], j, | ||||
| 			      so+tp[2*j  ].second, | ||||
| 			      so+tp[2*j+1].second, | ||||
| 			      type); | ||||
|       compress.CompressExchange(p0,p1, rhs_p, j, | ||||
| 				so+tp[2*j  ].second, | ||||
| 				so+tp[2*j+1].second, | ||||
| 				type); | ||||
|   }); | ||||
|   rhs_v.ViewClose(); | ||||
| } | ||||
| @@ -230,8 +234,8 @@ public: | ||||
|   }; | ||||
|   struct Merge { | ||||
|     cobj * mpointer; | ||||
|     Vector<scalar_object *> rpointers; | ||||
|     Vector<cobj *> vpointers; | ||||
|     //    std::vector<scalar_object *> rpointers; | ||||
|     std::vector<cobj *> vpointers; | ||||
|     Integer buffer_size; | ||||
|     Integer type; | ||||
|   }; | ||||
| @@ -406,6 +410,7 @@ public: | ||||
|       comms_bytes+=bytes; | ||||
|       shm_bytes  +=2*Packets[i].bytes-bytes; | ||||
|     } | ||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||
|   } | ||||
|  | ||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||
| @@ -420,7 +425,7 @@ public: | ||||
|   //////////////////////////////////////////////////////////////////////// | ||||
|   void Communicate(void) | ||||
|   { | ||||
|     if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){ | ||||
|     if ( 0 ){ | ||||
|       thread_region { | ||||
| 	// must be called in parallel region | ||||
| 	int mythread  = thread_num(); | ||||
| @@ -569,7 +574,7 @@ public: | ||||
|     d.buffer_size = buffer_size; | ||||
|     dv.push_back(d); | ||||
|   } | ||||
|   void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) { | ||||
|   void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) { | ||||
|     Merge m; | ||||
|     m.type     = type; | ||||
|     m.mpointer = merge_p; | ||||
| @@ -582,6 +587,7 @@ public: | ||||
|   } | ||||
|   template<class decompressor>  void CommsMergeSHM(decompressor decompress) { | ||||
|     mpi3synctime-=usecond(); | ||||
|     accelerator_barrier(); | ||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||
|     mpi3synctime+=usecond(); | ||||
|     shmmergetime-=usecond(); | ||||
| @@ -1114,8 +1120,8 @@ public: | ||||
|     int bytes = (reduced_buffer_size*datum_bytes)/simd_layout; | ||||
|     assert(bytes*simd_layout == reduced_buffer_size*datum_bytes); | ||||
|  | ||||
|     Vector<cobj *> rpointers(maxl); | ||||
|     Vector<cobj *> spointers(maxl); | ||||
|     std::vector<cobj *> rpointers(maxl); | ||||
|     std::vector<cobj *> spointers(maxl); | ||||
|  | ||||
|     /////////////////////////////////////////// | ||||
|     // Work out what to send where | ||||
|   | ||||
| @@ -195,12 +195,15 @@ void acceleratorInit(void) | ||||
| #ifdef GRID_SYCL | ||||
|  | ||||
| cl::sycl::queue *theGridAccelerator; | ||||
| cl::sycl::queue *theCopyAccelerator; | ||||
| void acceleratorInit(void) | ||||
| { | ||||
|   int nDevices = 1; | ||||
|   cl::sycl::gpu_selector selector; | ||||
|   cl::sycl::device selectedDevice { selector }; | ||||
|   theGridAccelerator = new sycl::queue (selectedDevice); | ||||
|   //  theCopyAccelerator = new sycl::queue (selectedDevice); | ||||
|   theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. | ||||
|  | ||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||
|   zeInit(0); | ||||
|   | ||||
| @@ -247,7 +247,6 @@ inline int  acceleratorIsCommunicable(void *ptr) | ||||
| ////////////////////////////////////////////// | ||||
| // SyCL acceleration | ||||
| ////////////////////////////////////////////// | ||||
|  | ||||
| #ifdef GRID_SYCL | ||||
| NAMESPACE_END(Grid); | ||||
| #include <CL/sycl.hpp> | ||||
| @@ -262,6 +261,7 @@ NAMESPACE_END(Grid); | ||||
| NAMESPACE_BEGIN(Grid); | ||||
|  | ||||
| extern cl::sycl::queue *theGridAccelerator; | ||||
| extern cl::sycl::queue *theCopyAccelerator; | ||||
|  | ||||
| #ifdef __SYCL_DEVICE_ONLY__ | ||||
| #define GRID_SIMT | ||||
| @@ -289,7 +289,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { | ||||
|       cgh.parallel_for(					\ | ||||
|       cl::sycl::nd_range<3>(global,local), \ | ||||
|       [=] (cl::sycl::nd_item<3> item) /*mutable*/     \ | ||||
|       [[intel::reqd_sub_group_size(8)]]	      \ | ||||
|       [[intel::reqd_sub_group_size(16)]]	      \ | ||||
|       {						      \ | ||||
|       auto iter1    = item.get_global_id(0);	      \ | ||||
|       auto iter2    = item.get_global_id(1);	      \ | ||||
| @@ -298,19 +298,19 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { | ||||
|      });	   			              \ | ||||
|     }); | ||||
|  | ||||
| #define accelerator_barrier(dummy) theGridAccelerator->wait(); | ||||
| #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } | ||||
|  | ||||
| inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; | ||||
| inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; | ||||
| inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; | ||||
| inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; | ||||
| inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { | ||||
|   theGridAccelerator->memcpy(to,from,bytes); | ||||
| } | ||||
| inline void acceleratorCopySynchronise(void) {  theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; } | ||||
| inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} | ||||
| inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} | ||||
| inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} | ||||
|  | ||||
| inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); } | ||||
| inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  {  theCopyAccelerator->memcpy(to,from,bytes);} | ||||
| inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} | ||||
| inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} | ||||
| inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();} | ||||
|  | ||||
| inline int  acceleratorIsCommunicable(void *ptr) | ||||
| { | ||||
| #if 0 | ||||
| @@ -511,7 +511,16 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN, | ||||
| inline void acceleratorFreeCpu  (void *ptr){free(ptr);}; | ||||
| #endif | ||||
|  | ||||
| ////////////////////////////////////////////// | ||||
| // Fencing needed ONLY for SYCL | ||||
| ////////////////////////////////////////////// | ||||
|  | ||||
| #ifdef GRID_SYCL | ||||
| inline void acceleratorFenceComputeStream(void){ accelerator_barrier();}; | ||||
| #else | ||||
| // Ordering within a stream guaranteed on Nvidia & AMD | ||||
| inline void acceleratorFenceComputeStream(void){ }; | ||||
| #endif | ||||
|  | ||||
| /////////////////////////////////////////////////// | ||||
| // Synchronise across local threads for divergence resynch | ||||
|   | ||||
| @@ -27,6 +27,7 @@ | ||||
|     /*  END LEGAL */ | ||||
| extern "C" { | ||||
| #include <openssl/sha.h> | ||||
| #include <openssl/evp.h> | ||||
| } | ||||
| #ifdef USE_IPP | ||||
| #include "ipp.h" | ||||
| @@ -70,10 +71,8 @@ public: | ||||
|   static inline std::vector<unsigned char> sha256(const void *data,size_t bytes) | ||||
|   { | ||||
|     std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH); | ||||
|     SHA256_CTX sha256; | ||||
|     SHA256_Init  (&sha256); | ||||
|     SHA256_Update(&sha256, data,bytes); | ||||
|     SHA256_Final (&hash[0], &sha256); | ||||
|     auto digest = EVP_get_digestbyname("SHA256"); | ||||
|     EVP_Digest(data, bytes, &hash[0], NULL, digest, NULL); | ||||
|     return hash; | ||||
|   } | ||||
|   static inline std::vector<int> sha256_seeds(const std::string &s) | ||||
|   | ||||
| @@ -148,7 +148,7 @@ If you want to build all the tests at once just use `make tests`. | ||||
| - `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional). | ||||
| - `--enable-numa`: enable NUMA first touch optimisation | ||||
| - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. | ||||
| - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). | ||||
| - `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 64 bytes). | ||||
| - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. | ||||
| - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). | ||||
| - `--disable-timers`: disable system dependent high-resolution timers. | ||||
|   | ||||
							
								
								
									
										131
									
								
								benchmarks/Benchmark_halo.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										131
									
								
								benchmarks/Benchmark_halo.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,131 @@ | ||||
|  /************************************************************************************* | ||||
|     Grid physics library, www.github.com/paboyle/Grid | ||||
|     Source file: ./benchmarks/Benchmark_dwf.cc | ||||
|     Copyright (C) 2015 | ||||
|  | ||||
|     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||
|     Author: paboyle <paboyle@ph.ed.ac.uk> | ||||
|  | ||||
|     This program is free software; you can redistribute it and/or modify | ||||
|     it under the terms of the GNU General Public License as published by | ||||
|     the Free Software Foundation; either version 2 of the License, or | ||||
|     (at your option) any later version. | ||||
|     This program is distributed in the hope that it will be useful, | ||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|     GNU General Public License for more details. | ||||
|     You should have received a copy of the GNU General Public License along | ||||
|     with this program; if not, write to the Free Software Foundation, Inc., | ||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||
|     See the full license in the file "LICENSE" in the top level distribution directory | ||||
|     *************************************************************************************/ | ||||
|     /*  END LEGAL */ | ||||
| #include <Grid/Grid.h> | ||||
| #ifdef GRID_CUDA | ||||
| #define CUDA_PROFILE | ||||
| #endif | ||||
|  | ||||
| #ifdef CUDA_PROFILE | ||||
| #include <cuda_profiler_api.h> | ||||
| #endif | ||||
|  | ||||
| using namespace std; | ||||
| using namespace Grid; | ||||
|  | ||||
| template<class d> | ||||
| struct scal { | ||||
|   d internal; | ||||
| }; | ||||
|  | ||||
|   Gamma::Algebra Gmu [] = { | ||||
|     Gamma::Algebra::GammaX, | ||||
|     Gamma::Algebra::GammaY, | ||||
|     Gamma::Algebra::GammaZ, | ||||
|     Gamma::Algebra::GammaT | ||||
|   }; | ||||
|  | ||||
|  | ||||
| int main (int argc, char ** argv) | ||||
| { | ||||
|   Grid_init(&argc,&argv); | ||||
|  | ||||
|   Coordinate latt4= GridDefaultLatt(); | ||||
|   Coordinate mpi  = GridDefaultMpi(); | ||||
|   Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd()); | ||||
|  | ||||
|   GridLogLayout(); | ||||
|  | ||||
|   int Ls=16; | ||||
|   for(int i=0;i<argc;i++) | ||||
|     if(std::string(argv[i]) == "-Ls"){ | ||||
|       std::stringstream ss(argv[i+1]); ss >> Ls; | ||||
|     } | ||||
|  | ||||
|    | ||||
|   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi); | ||||
|   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); | ||||
|   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Making s innermost grids"<<std::endl; | ||||
|   GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi()); | ||||
|   GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid); | ||||
|   GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid); | ||||
|   GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid); | ||||
|  | ||||
|   std::vector<int> seeds4({1,2,3,4}); | ||||
|   std::vector<int> seeds5({5,6,7,8}); | ||||
|  | ||||
|   std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; | ||||
|   GridParallelRNG          RNG4(UGrid);  RNG4.SeedUniqueString(std::string("The 4D RNG")); | ||||
|   std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; | ||||
|   GridParallelRNG          RNG5(FGrid);  RNG5.SeedUniqueString(std::string("The 5D RNG")); | ||||
|   std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||
|  | ||||
|   LatticeFermionF src   (FGrid); random(RNG5,src); | ||||
|   RealD N2 = 1.0/::sqrt(norm2(src)); | ||||
|   src = src*N2; | ||||
|  | ||||
|   std::cout << GridLogMessage << "Drawing gauge field" << std::endl; | ||||
|   LatticeGaugeFieldF Umu(UGrid); | ||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||
|   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; | ||||
|  | ||||
|   RealD mass=0.1; | ||||
|   RealD M5  =1.8; | ||||
|  | ||||
|   RealD NP = UGrid->_Nprocessors; | ||||
|   RealD NN = UGrid->NodeCount(); | ||||
|  | ||||
|   DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||
|  | ||||
|   const int ncall = 500; | ||||
|   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|   std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::HaloGatherOpt         "<<std::endl; | ||||
|   std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||
|   { | ||||
|     typename DomainWallFermionF::Compressor compressor(0); | ||||
|     FGrid->Barrier(); | ||||
|     Dw.Stencil.HaloExchangeOptGather(src,compressor); | ||||
|     double t0=usecond(); | ||||
|     for(int i=0;i<ncall;i++){ | ||||
|       Dw.Stencil.HaloExchangeOptGather(src,compressor); | ||||
|     } | ||||
|     double t1=usecond(); | ||||
|     FGrid->Barrier(); | ||||
|  | ||||
|     double bytes=0.0; | ||||
|     if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3]; | ||||
|     if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3]; | ||||
|     if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3]; | ||||
|     if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2]; | ||||
|     bytes = bytes * Ls * 8.* (24.+12.)* 2.0; | ||||
|  | ||||
|     std::cout<<GridLogMessage << "Gather us /call =   "<< (t1-t0)/ncall<<std::endl; | ||||
|     std::cout<<GridLogMessage << "Gather MBs /call =   "<< bytes*ncall/(t1-t0)<<std::endl; | ||||
|  | ||||
|   } | ||||
|  | ||||
|   Grid_finalize(); | ||||
|   exit(0); | ||||
| } | ||||
							
								
								
									
										62
									
								
								systems/PVC/benchmarks/run-1tile.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										62
									
								
								systems/PVC/benchmarks/run-1tile.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,62 @@ | ||||
| #!/bin/sh | ||||
| ##SBATCH -p PVC-SPR-QZEH  | ||||
| ##SBATCH -p PVC-ICX-QZNW | ||||
| #SBATCH -p QZ1J-ICX-PVC | ||||
| ##SBATCH -p QZ1J-SPR-PVC-2C | ||||
|  | ||||
| source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | ||||
|  | ||||
| export NT=8 | ||||
|  | ||||
| export I_MPI_OFFLOAD=1 | ||||
| export I_MPI_OFFLOAD_TOPOLIB=level_zero | ||||
| export I_MPI_OFFLOAD_DOMAIN_SIZE=-1 | ||||
|  | ||||
| # export IGC_EnableLSCFenceUGMBeforeEOT=0 | ||||
| # export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False" | ||||
| export SYCL_DEVICE_FILTER=gpu,level_zero | ||||
| #export IGC_ShaderDumpEnable=1  | ||||
| #export IGC_DumpToCurrentDir=1 | ||||
| export I_MPI_OFFLOAD_CELL=tile | ||||
| export EnableImplicitScaling=0 | ||||
| export EnableWalkerPartition=0 | ||||
| export ZE_AFFINITY_MASK=0.0 | ||||
| mpiexec -launcher ssh -n 1 -host localhost  ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768 | ||||
|  | ||||
| export ZE_AFFINITY_MASK=0 | ||||
| export I_MPI_OFFLOAD_CELL=device | ||||
| export EnableImplicitScaling=1 | ||||
| export EnableWalkerPartition=1 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| #mpiexec -launcher ssh -n 2 -host localhost  vtune -collect gpu-hotspots -knob gpu-sampling-interval=1 -data-limit=0 -r ./vtune_run4 -- ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 | ||||
|  | ||||
| #mpiexec  -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 | ||||
|  | ||||
| #mpiexec  -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 | ||||
|  | ||||
| #mpiexec  -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 | ||||
|  | ||||
| #mpiexec  -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 | ||||
|  | ||||
| #mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 0 | ||||
| #mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 1 | ||||
|  | ||||
							
								
								
									
										34
									
								
								systems/PVC/benchmarks/run-2tile-mpi.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										34
									
								
								systems/PVC/benchmarks/run-2tile-mpi.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| #!/bin/bash | ||||
| ##SBATCH -p PVC-SPR-QZEH  | ||||
| ##SBATCH -p PVC-ICX-QZNW | ||||
|  | ||||
| #SBATCH -p QZ1J-ICX-PVC | ||||
|  | ||||
| source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh | ||||
|  | ||||
| export NT=16 | ||||
|  | ||||
| # export IGC_EnableLSCFenceUGMBeforeEOT=0 | ||||
| # export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False" | ||||
| #export IGC_ShaderDumpEnable=1  | ||||
| #export IGC_DumpToCurrentDir=1 | ||||
| export I_MPI_OFFLOAD=1 | ||||
| export I_MPI_OFFLOAD_TOPOLIB=level_zero | ||||
| export I_MPI_OFFLOAD_DOMAIN_SIZE=-1 | ||||
| export SYCL_DEVICE_FILTER=gpu,level_zero | ||||
| export I_MPI_OFFLOAD_CELL=tile | ||||
| export EnableImplicitScaling=0 | ||||
| export EnableWalkerPartition=0 | ||||
| export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 | ||||
| export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 | ||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 | ||||
|  | ||||
| for i in 0  | ||||
| do | ||||
| mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768 | ||||
| mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1  --device-mem 32768 | ||||
| done | ||||
| #mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.1x2.log | ||||
| #mpiexec -launcher ssh -n 2 -host localhost  ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT  --shm-mpi 1 > halo.2tile.2x1.log | ||||
|  | ||||
|  | ||||
							
								
								
									
										14
									
								
								systems/PVC/benchmarks/wrap.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										14
									
								
								systems/PVC/benchmarks/wrap.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,14 @@ | ||||
| #!/bin/sh | ||||
|  | ||||
| export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID | ||||
|  | ||||
| echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK | ||||
|  | ||||
|  | ||||
| if [ $MPI_LOCALRANKID = "0" ]  | ||||
| then | ||||
| #  ~psteinbr/build_pti/ze_tracer -h $@ | ||||
|   onetrace --chrome-device-timeline $@ | ||||
| else | ||||
|   $@ | ||||
| fi | ||||
							
								
								
									
										16
									
								
								systems/PVC/config-command
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								systems/PVC/config-command
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| INSTALL=/nfs/site/home/azusayax/install | ||||
| ../../configure \ | ||||
| 	--enable-simd=GPU \ | ||||
| 	--enable-gen-simd-width=64 \ | ||||
| 	--enable-comms=mpi-auto \ | ||||
| 	--disable-accelerator-cshift \ | ||||
| 	--disable-gparity \ | ||||
| 	--disable-fermion-reps \ | ||||
| 	--enable-shm=nvlink \ | ||||
| 	--enable-accelerator=sycl \ | ||||
| 	--enable-unified=no \ | ||||
| 	MPICXX=mpicxx \ | ||||
| 	CXX=dpcpp \ | ||||
| 	LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ | ||||
| 	CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare" | ||||
|  | ||||
							
								
								
									
										11
									
								
								systems/PVC/setup.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								systems/PVC/setup.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | ||||
| export https_proxy=http://proxy-chain.intel.com:911 | ||||
| export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| module load intel-release | ||||
| source /opt/intel/oneapi/PVC_setup.sh | ||||
| #source /opt/intel/oneapi/ATS_setup.sh | ||||
| module load intel/mpich/pvc45.3 | ||||
| export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH | ||||
|  | ||||
| #clsh embargo-ci-neo-022845 | ||||
| #source /opt/intel/vtune_amplifier/amplxe-vars.sh | ||||
| @@ -793,6 +793,7 @@ int main (int argc, char ** argv) | ||||
|     } | ||||
|     std::cout <<" OK ! "<<std::endl; | ||||
|  | ||||
| #ifdef USE_FP16 | ||||
|     // Double to Half | ||||
|     std::cout << GridLogMessage<< "Double to half" ; | ||||
|     precisionChange(&H[0],&D[0],Ndp); | ||||
| @@ -822,6 +823,7 @@ int main (int argc, char ** argv) | ||||
|       assert( tmp < 1.0e-3 ); | ||||
|     } | ||||
|     std::cout <<" OK ! "<<std::endl; | ||||
| #endif | ||||
|  | ||||
|   } | ||||
|   Grid_finalize(); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user