diff --git a/.gitignore b/.gitignore
index 5338acb9..40156f9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,6 +88,7 @@ Thumbs.db
 # build directory #
 ###################
 build*/*
+Documentation/_build
 
 # IDE related files #
 #####################
diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index b9594678..2fd187ff 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -442,6 +442,8 @@ public:
   for(int p=0; poSites()*nbasis, Nsimd, {
@@ -453,7 +455,7 @@ public:
   StencilEntry *SE;
   for(int p=0;p AcceleratorViewContainer;
   for(int p=0;p_is_local) {
@@ -754,7 +758,7 @@ public:
   StencilEntry *SE;
   for(int p=0;p_is_local) {
diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h
index ad42f049..29f0ec4b 100644
--- a/Grid/algorithms/FFT.h
+++ b/Grid/algorithms/FFT.h
@@ -136,7 +136,7 @@ public:
     flops=0;
     usec =0;
     Coordinate layout(Nd,1);
-    sgrid = new GridCartesian(dimensions,layout,processors);
+    sgrid = new GridCartesian(dimensions,layout,processors,*grid);
   };
 
   ~FFT ( void)  {
@@ -182,7 +182,7 @@ public:
       pencil_gd[dim] = G*processors[dim];
 
       // Pencil global vol LxLxGxLxL per node
-      GridCartesian pencil_g(pencil_gd,layout,processors);
+      GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
 
       // Construct pencils
       typedef typename vobj::scalar_object sobj;
diff --git a/Grid/algorithms/iterative/SchurRedBlack.h b/Grid/algorithms/iterative/SchurRedBlack.h
index d0b133a3..15ef95c7 100644
--- a/Grid/algorithms/iterative/SchurRedBlack.h
+++ b/Grid/algorithms/iterative/SchurRedBlack.h
@@ -132,6 +132,31 @@ namespace Grid {
       (*this)(_Matrix,in,out,guess);
     }
 
+    void RedBlackSource(Matrix &_Matrix, const std::vector &in, std::vector &src_o)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      Field tmp(grid);
+      int nblock = in.size();
+      for(int b=0;b &in, const std::vector &sol_o, std::vector &out)
+    {
+      GridBase *grid = _Matrix.RedBlackGrid();
+      Field tmp(grid);
+      int nblock = in.size();
+      for(int b=0;b void operator()(Matrix &_Matrix, const std::vector &in, std::vector &out,Guesser &guess)
     {
@@ -150,9 +175,11 @@ namespace Grid {
       ////////////////////////////////////////////////
       // Prepare RedBlack source
       ////////////////////////////////////////////////
-      for(int b=0;b
 NAMESPACE_BEGIN(Grid);
 
+bool Stencil_force_mpi = true;
+
 ///////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////
diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index a15f9789..ffcfe37a 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -35,6 +35,8 @@ Author: Peter Boyle
 NAMESPACE_BEGIN(Grid);
 
+extern bool Stencil_force_mpi ;
+
 class CartesianCommunicator : public SharedMemory {
 
 public:
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index 5713fe35..01335b41 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -370,7 +370,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vectorShmBufferTranslate(dest,recv);
+    assert(shm!=NULL);
+    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+    acceleratorCopySynchronise(); // MPI prob slower
   }
 
   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
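Note: the new `Stencil_force_mpi` global defaults to `true`, preserving the old always-route-through-MPI behaviour; clearing it lets on-node neighbours take the direct device-to-device copy path via `ShmBufferTranslate` shown above. A minimal sketch of toggling it from application code — the setup around it is assumed boilerplate, not part of this patch:

    #include <Grid/Grid.h>

    int main(int argc, char **argv)
    {
      Grid::Grid_init(&argc, &argv);
      // Assumption: flip the global before any stencil traffic is issued, so
      // intra-node halo exchange uses shared-memory device copies, not MPI.
      Grid::Stencil_force_mpi = false;
      // ... construct grids and operators, run the calculation ...
      Grid::Grid_finalize();
      return 0;
    }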
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
index 11788744..795f3928 100644
--- a/Grid/communicator/SharedMemoryMPI.cc
+++ b/Grid/communicator/SharedMemoryMPI.cc
@@ -513,26 +513,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
-  auto zeDevice = cl::sycl::get_native(theGridAccelerator->get_device());
-  auto zeContext= cl::sycl::get_native(theGridAccelerator->get_context());
-  ze_device_mem_alloc_desc_t zeDesc = {};
-  zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf);
-  std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes
-            << "bytes at "<< std::hex<< ShmCommBuf <(theGridAccelerator->get_device());
+  auto zeContext = cl::sycl::get_native(theGridAccelerator->get_context());
+
+  ze_ipc_mem_handle_t ihandle;
+  clone_mem_t handle;
+
   if ( r==WorldShmRank ) {
-    auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&handle);
+    auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
     if ( err != ZE_RESULT_SUCCESS ) {
-      std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "< &ret,sobj a,const Lattice &x,const Lattice &
   autoView( x_v , x, AcceleratorRead);
   autoView( y_v , y, AcceleratorRead);
   accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
-    auto tmp = a*x_v(ss)+y_v(ss);
+    auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]);
     coalescedWrite(ret_v[ss],tmp);
   });
 }
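Note: the `axpy` change above swaps per-site vector temporaries (`x_v(ss)`) for explicit `coalescedRead`/`coalescedWrite`, so under GRID_SIMT each GPU thread touches only its own SIMD lane. A sketch of the same pattern as a self-contained helper — an illustration in Grid's accelerator idiom, not part of the patch:

    // Sketch: GPU-friendly site loop mirroring the hunk above.
    // coalescedRead pulls one SIMD lane per thread on GPU builds
    // (the whole vector word on CPU); coalescedWrite scatters it back.
    template<class sobj,class vobj>
    inline void axpy_sketch(Lattice<vobj> &ret, sobj a,
                            const Lattice<vobj> &x, const Lattice<vobj> &y)
    {
      ret.Checkerboard() = x.Checkerboard();
      autoView( ret_v, ret, AcceleratorWrite);
      autoView( x_v  , x  , AcceleratorRead);
      autoView( y_v  , y  , AcceleratorRead);
      accelerator_for(ss, x_v.size(), vobj::Nsimd(), {
        auto tmp = a*coalescedRead(x_v[ss]) + coalescedRead(y_v[ss]);
        coalescedWrite(ret_v[ss], tmp);
      });
    }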
diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h
index 863b2548..0928cbd7 100644
--- a/Grid/lattice/Lattice_basis.h
+++ b/Grid/lattice/Lattice_basis.h
@@ -125,7 +125,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
   for(int k=k0; k inline void sliceSum(const Lattice &Data,std::vector<
   // But easily avoided by using double precision fields
   ///////////////////////////////////////////////////////
   typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_object::scalar_type scalar_type;
   GridBase  *grid = Data.Grid();
   assert(grid!=NULL);
@@ -419,20 +420,19 @@ template inline void sliceSum(const Lattice &Data,std::vector<
   }
 
   // sum over nodes.
-  sobj gsum;
   for(int t=0;t_processor_coor[orthogdim] ) {
-      gsum=lsSum[lt];
+      result[t]=lsSum[lt];
     } else {
-      gsum=Zero();
+      result[t]=Zero();
     }
-    grid->GlobalSum(gsum);
-
-    result[t]=gsum;
   }
+  scalar_type * ptr = (scalar_type *) &result[0];
+  int words = fd*sizeof(sobj)/sizeof(scalar_type);
+  grid->GlobalSumVector(ptr, words);
 }
 
 template
diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index 7adf09d7..43a07c2d 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -364,16 +364,22 @@ inline void blockSum(Lattice &coarseData,const Lattice &fineData)
   autoView( coarseData_ , coarseData, AcceleratorWrite);
   autoView( fineData_   , fineData, AcceleratorRead);
 
+  auto coarseData_p = &coarseData_[0];
+  auto fineData_p = &fineData_[0];
+
   Coordinate fine_rdimensions = fine->_rdimensions;
   Coordinate coarse_rdimensions = coarse->_rdimensions;
+
+  vobj zz = Zero();
 
   accelerator_for(sc,coarse->oSites(),1,{
 
     // One thread per sub block
     Coordinate coor_c(_ndimension);
     Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate
-    coarseData_[sc]=Zero();
 
+    vobj cd = zz;
+
+    for(int sb=0;sb &coarseData,const Lattice &fineData)
       for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
       Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
-      coarseData_[sc]=coarseData_[sc]+fineData_[sf];
+      cd=cd+fineData_p[sf];
     }
+    coarseData_p[sc] = cd;
+
   });
   return;
 }
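Note: both hunks above apply the same two ideas — accumulate into a register-resident temporary (`cd`) instead of read-modify-writing global memory inside the inner loop, and replace many small `GlobalSum` calls with a single `GlobalSumVector` over the flattened result, trading `fd` latency-bound reductions for one. A sketch of the batching step under the assumption that `result` holds `fd` scalar objects:

    // Sketch: one all-reduce over the whole slice vector, reinterpreting
    // the array of scalar objects as raw scalar words (as in the hunk).
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::scalar_object::scalar_type scalar_type;
    scalar_type *ptr = (scalar_type *) &result[0];
    int words = fd * sizeof(sobj) / sizeof(scalar_type);
    grid->GlobalSumVector(ptr, words);   // single latency-bound reduction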
diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h
index 09777204..59fc17d5 100644
--- a/Grid/qcd/action/fermion/Fermion.h
+++ b/Grid/qcd/action/fermion/Fermion.h
@@ -115,9 +115,9 @@ typedef WilsonFermion WilsonFermionR;
 typedef WilsonFermion WilsonFermionF;
 typedef WilsonFermion WilsonFermionD;
 
-typedef WilsonFermion WilsonFermionRL;
-typedef WilsonFermion WilsonFermionFH;
-typedef WilsonFermion WilsonFermionDF;
+//typedef WilsonFermion WilsonFermionRL;
+//typedef WilsonFermion WilsonFermionFH;
+//typedef WilsonFermion WilsonFermionDF;
 
 typedef WilsonFermion WilsonAdjFermionR;
 typedef WilsonFermion WilsonAdjFermionF;
@@ -158,41 +158,41 @@ typedef DomainWallFermion DomainWallFermionR;
 typedef DomainWallFermion DomainWallFermionF;
 typedef DomainWallFermion DomainWallFermionD;
 
-typedef DomainWallFermion DomainWallFermionRL;
-typedef DomainWallFermion DomainWallFermionFH;
-typedef DomainWallFermion DomainWallFermionDF;
+//typedef DomainWallFermion DomainWallFermionRL;
+//typedef DomainWallFermion DomainWallFermionFH;
+//typedef DomainWallFermion DomainWallFermionDF;
 
 typedef DomainWallEOFAFermion DomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion DomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion DomainWallEOFAFermionD;
 
-typedef DomainWallEOFAFermion DomainWallEOFAFermionRL;
-typedef DomainWallEOFAFermion DomainWallEOFAFermionFH;
-typedef DomainWallEOFAFermion DomainWallEOFAFermionDF;
+//typedef DomainWallEOFAFermion DomainWallEOFAFermionRL;
+//typedef DomainWallEOFAFermion DomainWallEOFAFermionFH;
+//typedef DomainWallEOFAFermion DomainWallEOFAFermionDF;
 
 typedef MobiusFermion MobiusFermionR;
 typedef MobiusFermion MobiusFermionF;
 typedef MobiusFermion MobiusFermionD;
 
-typedef MobiusFermion MobiusFermionRL;
-typedef MobiusFermion MobiusFermionFH;
-typedef MobiusFermion MobiusFermionDF;
+//typedef MobiusFermion MobiusFermionRL;
+//typedef MobiusFermion MobiusFermionFH;
+//typedef MobiusFermion MobiusFermionDF;
 
 typedef MobiusEOFAFermion MobiusEOFAFermionR;
 typedef MobiusEOFAFermion MobiusEOFAFermionF;
 typedef MobiusEOFAFermion MobiusEOFAFermionD;
 
-typedef MobiusEOFAFermion MobiusEOFAFermionRL;
-typedef MobiusEOFAFermion MobiusEOFAFermionFH;
-typedef MobiusEOFAFermion MobiusEOFAFermionDF;
+//typedef MobiusEOFAFermion MobiusEOFAFermionRL;
+//typedef MobiusEOFAFermion MobiusEOFAFermionFH;
+//typedef MobiusEOFAFermion MobiusEOFAFermionDF;
 
 typedef ZMobiusFermion ZMobiusFermionR;
 typedef ZMobiusFermion ZMobiusFermionF;
 typedef ZMobiusFermion ZMobiusFermionD;
 
-typedef ZMobiusFermion ZMobiusFermionRL;
-typedef ZMobiusFermion ZMobiusFermionFH;
-typedef ZMobiusFermion ZMobiusFermionDF;
+//typedef ZMobiusFermion ZMobiusFermionRL;
+//typedef ZMobiusFermion ZMobiusFermionFH;
+//typedef ZMobiusFermion ZMobiusFermionDF;
 
 // Ls vectorised
 typedef ScaledShamirFermion ScaledShamirFermionR;
@@ -235,49 +235,49 @@ typedef WilsonFermion GparityWilsonFermionR;
 typedef WilsonFermion GparityWilsonFermionF;
 typedef WilsonFermion GparityWilsonFermionD;
 
-typedef WilsonFermion GparityWilsonFermionRL;
-typedef WilsonFermion GparityWilsonFermionFH;
-typedef WilsonFermion GparityWilsonFermionDF;
+//typedef WilsonFermion GparityWilsonFermionRL;
+//typedef WilsonFermion GparityWilsonFermionFH;
+//typedef WilsonFermion GparityWilsonFermionDF;
 
 typedef DomainWallFermion GparityDomainWallFermionR;
 typedef DomainWallFermion GparityDomainWallFermionF;
 typedef DomainWallFermion GparityDomainWallFermionD;
 
-typedef DomainWallFermion GparityDomainWallFermionRL;
-typedef DomainWallFermion GparityDomainWallFermionFH;
-typedef DomainWallFermion GparityDomainWallFermionDF;
+//typedef DomainWallFermion GparityDomainWallFermionRL;
+//typedef DomainWallFermion GparityDomainWallFermionFH;
+//typedef DomainWallFermion GparityDomainWallFermionDF;
 
 typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionR;
 typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionF;
 typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionD;
 
-typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionRL;
-typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionFH;
-typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionDF;
+//typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionRL;
+//typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionFH;
+//typedef DomainWallEOFAFermion GparityDomainWallEOFAFermionDF;
 
 typedef WilsonTMFermion GparityWilsonTMFermionR;
 typedef WilsonTMFermion GparityWilsonTMFermionF;
 typedef WilsonTMFermion GparityWilsonTMFermionD;
 
-typedef WilsonTMFermion GparityWilsonTMFermionRL;
-typedef WilsonTMFermion GparityWilsonTMFermionFH;
-typedef WilsonTMFermion GparityWilsonTMFermionDF;
+//typedef WilsonTMFermion GparityWilsonTMFermionRL;
+//typedef WilsonTMFermion GparityWilsonTMFermionFH;
+//typedef WilsonTMFermion GparityWilsonTMFermionDF;
 
 typedef MobiusFermion GparityMobiusFermionR;
 typedef MobiusFermion GparityMobiusFermionF;
 typedef MobiusFermion GparityMobiusFermionD;
 
-typedef MobiusFermion GparityMobiusFermionRL;
-typedef MobiusFermion GparityMobiusFermionFH;
-typedef MobiusFermion GparityMobiusFermionDF;
+//typedef MobiusFermion GparityMobiusFermionRL;
+//typedef MobiusFermion GparityMobiusFermionFH;
+//typedef MobiusFermion GparityMobiusFermionDF;
 
 typedef MobiusEOFAFermion GparityMobiusEOFAFermionR;
 typedef MobiusEOFAFermion GparityMobiusEOFAFermionF;
 typedef MobiusEOFAFermion GparityMobiusEOFAFermionD;
 
-typedef MobiusEOFAFermion GparityMobiusEOFAFermionRL;
-typedef MobiusEOFAFermion GparityMobiusEOFAFermionFH;
-typedef MobiusEOFAFermion GparityMobiusEOFAFermionDF;
+//typedef MobiusEOFAFermion GparityMobiusEOFAFermionRL;
+//typedef MobiusEOFAFermion GparityMobiusEOFAFermionFH;
+//typedef MobiusEOFAFermion GparityMobiusEOFAFermionDF;
 
 typedef ImprovedStaggeredFermion ImprovedStaggeredFermionR;
 typedef ImprovedStaggeredFermion ImprovedStaggeredFermionF;
diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h
index 5b0a67d3..8017bc76 100644
--- a/Grid/qcd/action/fermion/GparityWilsonImpl.h
+++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h
@@ -409,8 +409,8 @@ typedef GparityWilsonImpl Gparit
 typedef GparityWilsonImpl GparityWilsonImplF;  // Float
 typedef GparityWilsonImpl GparityWilsonImplD;  // Double
 
-typedef GparityWilsonImpl GparityWilsonImplRL; // Real.. whichever prec
-typedef GparityWilsonImpl GparityWilsonImplFH; // Float
-typedef GparityWilsonImpl GparityWilsonImplDF; // Double
+//typedef GparityWilsonImpl GparityWilsonImplRL; // Real.. whichever prec
+//typedef GparityWilsonImpl GparityWilsonImplFH; // Float
+//typedef GparityWilsonImpl GparityWilsonImplDF; // Double
 
 NAMESPACE_END(Grid);
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h
index 0760bcba..e0e08c1c 100644
--- a/Grid/qcd/action/fermion/WilsonCompressor.h
+++ b/Grid/qcd/action/fermion/WilsonCompressor.h
@@ -68,11 +68,12 @@ public:
   /*****************************************************/
   /* Compress includes precision change if mpi data is not same */
   /*****************************************************/
-  template
-  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
-    _SiteHalfSpinor tmp;
-    projector::Proj(tmp,in,mu,dag);
-    vstream(buf[o],tmp);
+  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
+    typedef decltype(coalescedRead(buf)) sobj;
+    sobj sp;
+    auto sin = coalescedRead(in);
+    projector::Proj(sp,sin,mu,dag);
+    coalescedWrite(buf,sp);
   }
 
   /*****************************************************/
@@ -82,13 +83,18 @@ public:
                                          const SiteHalfSpinor * __restrict__ vp0,
                                          const SiteHalfSpinor * __restrict__ vp1,
                                          Integer type,Integer o) const {
+#ifdef GRID_SIMT
+    exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
+#else
     SiteHalfSpinor tmp1;
     SiteHalfSpinor tmp2;
     exchange(tmp1,tmp2,vp0[o],vp1[o],type);
     vstream(mp[2*o  ],tmp1);
     vstream(mp[2*o+1],tmp2);
+#endif
   }
 
+
   /*****************************************************/
   /* Have a decompression step if mpi data is not same */
   /*****************************************************/
@@ -105,6 +111,28 @@ public:
                                              const SiteSpinor * __restrict__ in,
                                              Integer j,Integer k, Integer m,Integer type) const {
+#ifdef GRID_SIMT
+    typedef SiteSpinor vobj;
+    typedef SiteHalfSpinor hvobj;
+    typedef decltype(coalescedRead(*in)) sobj;
+    typedef decltype(coalescedRead(*out0)) hsobj;
+
+    unsigned int Nsimd = vobj::Nsimd();
+    unsigned int mask = Nsimd >> (type + 1);
+    int lane = acceleratorSIMTlane(Nsimd);
+    int j0 = lane &(~mask); // inner coor zero
+    int j1 = lane |(mask) ; // inner coor one
+    const vobj *vp0 = &in[k];
+    const vobj *vp1 = &in[m];
+    const vobj *vp = (lane&mask) ? vp1:vp0;
+    auto sa = coalescedRead(*vp,j0);
+    auto sb = coalescedRead(*vp,j1);
+    hsobj psa, psb;
+    projector::Proj(psa,sa,mu,dag);
+    projector::Proj(psb,sb,mu,dag);
+    coalescedWrite(out0[j],psa);
+    coalescedWrite(out1[j],psb);
+#else
     SiteHalfSpinor temp1, temp2;
     SiteHalfSpinor temp3, temp4;
     projector::Proj(temp1,in[k],mu,dag);
@@ -112,6 +140,7 @@ public:
     exchange(temp3,temp4,temp1,temp2,type);
     vstream(out0[j],temp3);
     vstream(out1[j],temp4);
+#endif
   }
 
   /*****************************************************/
@@ -121,6 +150,7 @@ public:
 };
 
+#if 0
 template
 class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
                                 typename std::enable_if::value>::type >
@@ -149,13 +179,23 @@ public:
   /*****************************************************/
   /* Compress includes precision change if mpi data is not same */
   /*****************************************************/
-  template
-  accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
-    _SiteHalfSpinor hsp;
+  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
+    SiteHalfSpinor hsp;
     SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
     projector::Proj(hsp,in,mu,dag);
     precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
   }
+  accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
+#ifdef GRID_SIMT
+    typedef decltype(coalescedRead(buf)) sobj;
+    sobj sp;
+    auto sin = coalescedRead(in);
+    projector::Proj(sp,sin,mu,dag);
+    coalescedWrite(buf,sp);
+#else
+    projector::Proj(buf,in,mu,dag);
+#endif
+  }
 
   /*****************************************************/
   /* Exchange includes precision change if mpi data is not same */
@@ -203,6 +243,7 @@ public:
   accelerator_inline bool DecompressionStep(void) const { return true; }
 };
+#endif
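Note: in the GRID_SIMT branch of the face decompressor above, each thread derives the pair of SIMD lanes that a `type`-level exchange combines: `mask = Nsimd >> (type+1)` selects one bit of the lane index, and clearing/setting it (`j0 = lane & ~mask`, `j1 = lane | mask`) yields the two partner coordinates. A small standalone illustration of that arithmetic (Nsimd fixed to 8 for the printout; not Grid code):

    #include <cstdio>
    // Illustration: partner lanes touched by the exchange at a given 'type'.
    int main(void)
    {
      const unsigned int Nsimd = 8;
      for (int type = 0; type < 3; type++) {
        unsigned int mask = Nsimd >> (type + 1);   // 4, 2, 1
        printf("type %d:", type);
        for (unsigned int lane = 0; lane < Nsimd; lane++) {
          unsigned int j0 = lane & ~mask;          // lane with the bit cleared
          unsigned int j1 = lane |  mask;          // lane with the bit set
          printf(" %u->(%u,%u)", lane, j0, j1);
        }
        printf("\n");
      }
      return 0;
    }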
 #define DECLARE_PROJ(Projector,Compressor,spProj) \
   class Projector { \
@@ -253,33 +294,8 @@ public:
   typedef typename Base::View_type View_type;
   typedef typename Base::StencilVector StencilVector;
 
-  double timer0;
-  double timer1;
-  double timer2;
-  double timer3;
-  double timer4;
-  double timer5;
-  double timer6;
-  uint64_t callsi;
-  void ZeroCountersi(void)
-  {
-    timer0=0;
-    timer1=0;
-    timer2=0;
-    timer3=0;
-    timer4=0;
-    timer5=0;
-    timer6=0;
-    callsi=0;
-  }
-  void Reporti(int calls)
-  {
-    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " < surface_list;
 
@@ -321,26 +337,18 @@ public:
   {
     std::vector > reqs;
     this->HaloExchangeOptGather(source,compress);
-    double t1=usecond();
     // Asynchronous MPI calls multidirectional, Isend etc...
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
     this->Communicate();
-    double t2=usecond(); timer1 += t2-t1;
     this->CommsMerge(compress);
-    double t3=usecond(); timer2 += t3-t2;
     this->CommsMergeSHM(compress);
-    double t4=usecond(); timer3 += t4-t3;
   }
 
   template
   void HaloExchangeOptGather(const Lattice &source,compressor &compress)
   {
     this->Prepare();
-    double t0=usecond();
     this->HaloGatherOpt(source,compress);
-    double t1=usecond();
-    timer0 += t1-t0;
-    callsi++;
   }
 
   template
@@ -352,12 +360,9 @@
     typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
     typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
 
-    this->mpi3synctime_g-=usecond();
     this->_grid->StencilBarrier();
-    this->mpi3synctime_g+=usecond();
 
     assert(source.Grid()==this->_grid);
-    this->halogtime-=usecond();
 
     this->u_comm_offset=0;
@@ -393,7 +398,6 @@
     }
     this->face_table_computed=1;
     assert(this->u_comm_offset==this->_unified_buffer_size);
-    this->halogtime+=usecond();
     accelerator_barrier();
   }
diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h
index 2ff6feba..2685796d 100644
--- a/Grid/qcd/action/fermion/WilsonImpl.h
+++ b/Grid/qcd/action/fermion/WilsonImpl.h
@@ -243,17 +243,17 @@ typedef WilsonImpl WilsonImplR
 typedef WilsonImpl WilsonImplF;  // Float
 typedef WilsonImpl WilsonImplD;  // Double
 
-typedef WilsonImpl WilsonImplRL;  // Real.. whichever prec
-typedef WilsonImpl WilsonImplFH;  // Float
-typedef WilsonImpl WilsonImplDF;  // Double
+//typedef WilsonImpl WilsonImplRL;  // Real.. whichever prec
+//typedef WilsonImpl WilsonImplFH;  // Float
+//typedef WilsonImpl WilsonImplDF;  // Double
 
 typedef WilsonImpl ZWilsonImplR;  // Real.. whichever prec
 typedef WilsonImpl ZWilsonImplF;  // Float
 typedef WilsonImpl ZWilsonImplD;  // Double
 
-typedef WilsonImpl ZWilsonImplRL;  // Real.. whichever prec
-typedef WilsonImpl ZWilsonImplFH;  // Float
-typedef WilsonImpl ZWilsonImplDF;  // Double
+//typedef WilsonImpl ZWilsonImplRL;  // Real.. whichever prec
+//typedef WilsonImpl ZWilsonImplFH;  // Float
+//typedef WilsonImpl ZWilsonImplDF;  // Double
 
 typedef WilsonImpl WilsonAdjImplR;  // Real.. whichever prec
 typedef WilsonImpl WilsonAdjImplF;  // Float
diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
index c3e0f821..1ed66bda 100644
--- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h
@@ -880,7 +880,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in,
   }
 
   std::vector G_s(Ls,1.0);
-  Integer sign = 1; // sign flip for vector/tadpole
+  RealD sign = 1; // sign flip for vector/tadpole
   if ( curr_type == Current::Axial ) {
     for(int s=0;s::SeqConservedCurrent(PropagatorField &q_in,
     for(int s=0;sbs[s];
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
index ffec05a0..35d1b841 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
@@ -73,17 +73,17 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -102,17 +102,17 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -131,17 +131,17 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
@@ -165,17 +165,17 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -194,17 +194,17 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -223,17 +223,17 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+//template<> void
+//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
@@ -280,17 +280,17 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -309,17 +309,17 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -338,17 +338,17 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
 /////////////////////////////////////////////////////////////////
@@ -371,17 +371,17 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -400,17 +400,17 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -429,17 +429,17 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
 
-#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
+// template<> void
+// WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+// #include 
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
index 4aed13bf..e025ba41 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h
@@ -74,15 +74,15 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -97,15 +97,15 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
@@ -121,15 +121,15 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//
+//template<> void
+//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
@@ -148,15 +148,15 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//
+//template<> void
+//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -171,15 +171,15 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//
+//template<> void
+//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -194,15 +194,15 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//
+//template<> void
+//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef MAYBEPERM
 #undef MULT_2SPIN
@@ -228,14 +228,14 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeF
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -249,14 +249,14 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGau
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -273,15 +273,15 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGau
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//
+//template<> void
+//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 /////////////////////////////////////////////////////////////////
 // Ls vectorised, dag Kernel, single
@@ -299,14 +299,14 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGau
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -320,14 +320,14 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, Doubled
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -341,14 +341,14 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Doubled
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #endif // VEC 5D
@@ -392,14 +392,14 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -413,14 +413,14 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -434,14 +434,14 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-
-template<> void
-WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//
+//template<> void
+//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 /////////////////////////////////////////////////////////////////
 // XYZT vectorised, dag Kernel, single
@@ -459,14 +459,14 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -480,14 +480,14 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -501,14 +501,14 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef MAYBEPERM
 #undef MULT_2SPIN
@@ -533,14 +533,14 @@ WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeF
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #define INTERIOR
@@ -554,14 +554,14 @@ WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGau
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
-template<> void
-WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
-    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
-#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
+//template<> void
+//WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
+//    int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
+//#include 
 
 #undef INTERIOR_AND_EXTERIOR
 #undef INTERIOR
@@ -577,14 +577,14 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGau
     int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
 #include 
 
-template<> void
DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include ///////////////////////////////////////////////////////////////// // Ls vectorised, dag Kernel, single @@ -602,14 +602,14 @@ WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGau int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #define INTERIOR @@ -623,14 +623,14 @@ WilsonKernels::AsmDhopSiteDagInt(StencilView &st, Doubled int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #undef INTERIOR_AND_EXTERIOR #undef INTERIOR @@ -645,14 +645,14 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Doubled int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include -template<> void -WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#include 
+//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include +//template<> void +//WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, +// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) +//#include #endif // VEC 5D diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/CayleyFermion5DInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/CayleyFermion5DInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/CayleyFermion5DInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/ContinuedFractionFermion5DInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/ContinuedFractionFermion5DInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/ContinuedFractionFermion5DInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/DomainWallEOFAFermionInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/DomainWallEOFAFermionInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/DomainWallEOFAFermionInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/MobiusEOFAFermionInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/MobiusEOFAFermionInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/MobiusEOFAFermionInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/PartialFractionFermion5DInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/PartialFractionFermion5DInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/PartialFractionFermion5DInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonCloverFermionInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonCloverFermionInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index 9cc05107..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonCloverFermionInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonCloverFermionInstantiation.cc.master \ No newline at end of file 
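All the kernel blocks being commented out above follow a single idiom: an explicit specialisation is declared with its full signature but no body, and the body is then spliced in textually by #include-ing a shared assembler-body header, so one hand-optimised body serves every implementation type. Disabling the half-precision-comms variants therefore reduces to commenting out declaration-plus-#include pairs. A minimal sketch of the idiom under assumed names (Kernels, ImplA and kernel_body.h are illustrative, not Grid's actual identifiers):

// kernel_body.h contains nothing but a braced compound statement,
// e.g.  { out = 2*ss; }
struct ImplA {};
template<class Impl> struct Kernels {
  static void DhopSite(int ss, int &out);
};
// Specialisation: the signature is written out here and the body is
// textually included, so this compiles exactly as if it were inline.
template<> void Kernels<ImplA>::DhopSite(int ss, int &out)
#include "kernel_body.h"

Each further implementation repeats the two-line signature/#include pair, which is why enabling or disabling one is a pure comment toggle.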
diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonFermion5DInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonFermion5DInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonFermion5DInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonFermionInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonFermionInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index 5f6ab65e..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonFermionInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonKernelsInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonKernelsInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index 87adea48..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonKernelsInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiationGparity.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonTMFermionInstantiationGparityWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonTMFermionInstantiationGparityWilsonImplDF.cc deleted file mode 120000 index d5789bcf..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/WilsonTMFermionInstantiationGparityWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonTMFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/impl.h b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/impl.h deleted file mode 100644 index 2f13ce8a..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplDF/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION GparityWilsonImplDF diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/CayleyFermion5DInstantiationGparityWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/CayleyFermion5DInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/CayleyFermion5DInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/ContinuedFractionFermion5DInstantiationGparityWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/ContinuedFractionFermion5DInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/ContinuedFractionFermion5DInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/DomainWallEOFAFermionInstantiationGparityWilsonImplFH.cc 
b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/DomainWallEOFAFermionInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/DomainWallEOFAFermionInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/MobiusEOFAFermionInstantiationGparityWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/MobiusEOFAFermionInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/MobiusEOFAFermionInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/PartialFractionFermion5DInstantiationGparityWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/PartialFractionFermion5DInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/PartialFractionFermion5DInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonCloverFermionInstantiationGparityWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonCloverFermionInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index 9cc05107..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonCloverFermionInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonFermion5DInstantiationGparityWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonFermion5DInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonFermion5DInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonFermionInstantiationGparityWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonFermionInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index 5f6ab65e..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonFermionInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonKernelsInstantiationGparityWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonKernelsInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index 87adea48..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonKernelsInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonKernelsInstantiationGparity.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonTMFermionInstantiationGparityWilsonImplFH.cc 
b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonTMFermionInstantiationGparityWilsonImplFH.cc deleted file mode 120000 index d5789bcf..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/WilsonTMFermionInstantiationGparityWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonTMFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/impl.h b/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/impl.h deleted file mode 100644 index ebcb6e62..00000000 --- a/Grid/qcd/action/fermion/instantiation/GparityWilsonImplFH/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION GparityWilsonImplFH diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/CayleyFermion5DInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/CayleyFermion5DInstantiationWilsonImplDF.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/CayleyFermion5DInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/ContinuedFractionFermion5DInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/ContinuedFractionFermion5DInstantiationWilsonImplDF.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/ContinuedFractionFermion5DInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/DomainWallEOFAFermionInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/DomainWallEOFAFermionInstantiationWilsonImplDF.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/DomainWallEOFAFermionInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/MobiusEOFAFermionInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/MobiusEOFAFermionInstantiationWilsonImplDF.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/MobiusEOFAFermionInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/PartialFractionFermion5DInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/PartialFractionFermion5DInstantiationWilsonImplDF.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/PartialFractionFermion5DInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonCloverFermionInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonCloverFermionInstantiationWilsonImplDF.cc deleted file mode 120000 index 9cc05107..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonCloverFermionInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonCloverFermionInstantiation.cc.master \ No 
newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonFermion5DInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonFermion5DInstantiationWilsonImplDF.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonFermion5DInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonFermionInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonFermionInstantiationWilsonImplDF.cc deleted file mode 120000 index 5f6ab65e..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonFermionInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonTMFermionInstantiationWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonTMFermionInstantiationWilsonImplDF.cc deleted file mode 120000 index d5789bcf..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonTMFermionInstantiationWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonTMFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/impl.h b/Grid/qcd/action/fermion/instantiation/WilsonImplDF/impl.h deleted file mode 100644 index 2adc6136..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION WilsonImplDF diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/CayleyFermion5DInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/CayleyFermion5DInstantiationWilsonImplFH.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/CayleyFermion5DInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/ContinuedFractionFermion5DInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/ContinuedFractionFermion5DInstantiationWilsonImplFH.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/ContinuedFractionFermion5DInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/DomainWallEOFAFermionInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/DomainWallEOFAFermionInstantiationWilsonImplFH.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/DomainWallEOFAFermionInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/MobiusEOFAFermionInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/MobiusEOFAFermionInstantiationWilsonImplFH.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/MobiusEOFAFermionInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ 
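The long run of symlink deletions here is mechanical. Under Grid/qcd/action/fermion/instantiation/, each implementation owns a directory whose only real file is impl.h, which does nothing but #define IMPLEMENTATION to that implementation's name; every other entry is a symlink to a shared *.cc.master translation unit that explicitly instantiates one fermion class against IMPLEMENTATION. Removing the FH/DF mixed-precision implementations therefore deletes one directory of symlinks apiece and, further down, drops their names from the lists in generate_instantiations.sh that create the links. A sketch of the mechanism, where SomeFermion stands in for any of the instantiated class templates:

// impl.h: the one per-directory file (WilsonImplF shown as an example)
#define IMPLEMENTATION WilsonImplF

// SomeFermionInstantiation.cc.master: shared by every directory via symlink;
// assumes the class template  template<class Impl> class SomeFermion;
// is defined in the headers it pulls in.
#include "impl.h"
template class SomeFermion<IMPLEMENTATION>;  // explicit template instantiation

Every directory compiles the same master sources against a different IMPLEMENTATION, splitting the heavy template instantiations across many small translation units that can build in parallel.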
-../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/PartialFractionFermion5DInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/PartialFractionFermion5DInstantiationWilsonImplFH.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/PartialFractionFermion5DInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonCloverFermionInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonCloverFermionInstantiationWilsonImplFH.cc deleted file mode 120000 index 9cc05107..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonCloverFermionInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonCloverFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonFermion5DInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonFermion5DInstantiationWilsonImplFH.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonFermion5DInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonFermionInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonFermionInstantiationWilsonImplFH.cc deleted file mode 120000 index 5f6ab65e..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonFermionInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc deleted file mode 100644 index f0b15e3b..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonKernelsInstantiationWilsonImplFH.cc +++ /dev/null @@ -1,51 +0,0 @@ -/************************************************************************************* - -Grid physics library, www.github.com/paboyle/Grid - -Source file: ./lib/qcd/action/fermion/WilsonKernels.cc - -Copyright (C) 2015, 2020 - -Author: Peter Boyle -Author: Peter Boyle -Author: paboyle -Author: Nils Meyer Regensburg University - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- -See the full license in the file "LICENSE" in the top level distribution -directory -*************************************************************************************/ -/* END LEGAL */ -#include -#include -#include - -#ifndef AVX512 -#ifndef QPX -#ifndef A64FX -#ifndef A64FXFIXEDSIZE -#include -#endif -#endif -#endif -#endif - -NAMESPACE_BEGIN(Grid); - -#include "impl.h" -template class WilsonKernels; - -NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonTMFermionInstantiationWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonTMFermionInstantiationWilsonImplFH.cc deleted file mode 120000 index d5789bcf..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/WilsonTMFermionInstantiationWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonTMFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/impl.h b/Grid/qcd/action/fermion/instantiation/WilsonImplFH/impl.h deleted file mode 100644 index e442863d..00000000 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplFH/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION WilsonImplFH diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/CayleyFermion5DInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/CayleyFermion5DInstantiationZWilsonImplDF.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/CayleyFermion5DInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/ContinuedFractionFermion5DInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/ContinuedFractionFermion5DInstantiationZWilsonImplDF.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/ContinuedFractionFermion5DInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/DomainWallEOFAFermionInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/DomainWallEOFAFermionInstantiationZWilsonImplDF.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/DomainWallEOFAFermionInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/MobiusEOFAFermionInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/MobiusEOFAFermionInstantiationZWilsonImplDF.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/MobiusEOFAFermionInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/PartialFractionFermion5DInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/PartialFractionFermion5DInstantiationZWilsonImplDF.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/PartialFractionFermion5DInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ 
-../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonFermion5DInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonFermion5DInstantiationZWilsonImplDF.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonFermion5DInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1 +0,0 @@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc deleted file mode 100644 index f0b15e3b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/WilsonKernelsInstantiationZWilsonImplDF.cc +++ /dev/null @@ -1,51 +0,0 @@ -/************************************************************************************* - -Grid physics library, www.github.com/paboyle/Grid - -Source file: ./lib/qcd/action/fermion/WilsonKernels.cc - -Copyright (C) 2015, 2020 - -Author: Peter Boyle -Author: Peter Boyle -Author: paboyle -Author: Nils Meyer Regensburg University - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- -See the full license in the file "LICENSE" in the top level distribution -directory -*************************************************************************************/ -/* END LEGAL */ -#include -#include -#include - -#ifndef AVX512 -#ifndef QPX -#ifndef A64FX -#ifndef A64FXFIXEDSIZE -#include -#endif -#endif -#endif -#endif - -NAMESPACE_BEGIN(Grid); - -#include "impl.h" -template class WilsonKernels; - -NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/impl.h b/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/impl.h deleted file mode 100644 index 7daf76ef..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplDF/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION ZWilsonImplDF diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/CayleyFermion5DInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/CayleyFermion5DInstantiationZWilsonImplFH.cc deleted file mode 120000 index cb1db625..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/CayleyFermion5DInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../CayleyFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/ContinuedFractionFermion5DInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/ContinuedFractionFermion5DInstantiationZWilsonImplFH.cc deleted file mode 120000 index c2d4b8fc..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/ContinuedFractionFermion5DInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../ContinuedFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/DomainWallEOFAFermionInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/DomainWallEOFAFermionInstantiationZWilsonImplFH.cc deleted file mode 120000 index 2f550a2b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/DomainWallEOFAFermionInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../DomainWallEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/MobiusEOFAFermionInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/MobiusEOFAFermionInstantiationZWilsonImplFH.cc deleted file mode 120000 index 7a8f1172..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/MobiusEOFAFermionInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../MobiusEOFAFermionInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/PartialFractionFermion5DInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/PartialFractionFermion5DInstantiationZWilsonImplFH.cc deleted file mode 120000 index 7f4cea71..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/PartialFractionFermion5DInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 @@ -../PartialFractionFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonFermion5DInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonFermion5DInstantiationZWilsonImplFH.cc deleted file mode 120000 index 804d0884..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonFermion5DInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1 +0,0 
@@ -../WilsonFermion5DInstantiation.cc.master \ No newline at end of file diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc deleted file mode 100644 index f0b15e3b..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/WilsonKernelsInstantiationZWilsonImplFH.cc +++ /dev/null @@ -1,51 +0,0 @@ -/************************************************************************************* - -Grid physics library, www.github.com/paboyle/Grid - -Source file: ./lib/qcd/action/fermion/WilsonKernels.cc - -Copyright (C) 2015, 2020 - -Author: Peter Boyle -Author: Peter Boyle -Author: paboyle -Author: Nils Meyer Regensburg University - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., -51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -See the full license in the file "LICENSE" in the top level distribution -directory -*************************************************************************************/ -/* END LEGAL */ -#include -#include -#include - -#ifndef AVX512 -#ifndef QPX -#ifndef A64FX -#ifndef A64FXFIXEDSIZE -#include -#endif -#endif -#endif -#endif - -NAMESPACE_BEGIN(Grid); - -#include "impl.h" -template class WilsonKernels; - -NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/impl.h b/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/impl.h deleted file mode 100644 index 7eb490db..00000000 --- a/Grid/qcd/action/fermion/instantiation/ZWilsonImplFH/impl.h +++ /dev/null @@ -1 +0,0 @@ -#define IMPLEMENTATION ZWilsonImplFH diff --git a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh index 72a9eaf9..d7553cdb 100755 --- a/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh +++ b/Grid/qcd/action/fermion/instantiation/generate_instantiations.sh @@ -9,8 +9,6 @@ STAG5_IMPL_LIST="" WILSON_IMPL_LIST=" \ WilsonImplF \ WilsonImplD \ - WilsonImplFH \ - WilsonImplDF \ WilsonAdjImplF \ WilsonAdjImplD \ WilsonTwoIndexSymmetricImplF \ @@ -18,26 +16,17 @@ WILSON_IMPL_LIST=" \ WilsonTwoIndexAntiSymmetricImplF \ WilsonTwoIndexAntiSymmetricImplD \ GparityWilsonImplF \ - GparityWilsonImplD \ - GparityWilsonImplFH \ - GparityWilsonImplDF" + GparityWilsonImplD " DWF_IMPL_LIST=" \ WilsonImplF \ WilsonImplD \ - WilsonImplFH \ - WilsonImplDF \ ZWilsonImplF \ - ZWilsonImplD \ - ZWilsonImplFH \ - ZWilsonImplDF " + ZWilsonImplD " GDWF_IMPL_LIST=" \ GparityWilsonImplF \ - GparityWilsonImplD \ - GparityWilsonImplFH \ - GparityWilsonImplDF" - + GparityWilsonImplD " IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST" diff --git a/Grid/qcd/spin/Dirac.h b/Grid/qcd/spin/Dirac.h index d03e0939..2f2a9732 100644 --- a/Grid/qcd/spin/Dirac.h +++ b/Grid/qcd/spin/Dirac.h @@ -40,7 +40,7 @@ See the full license in the file "LICENSE" in the 
top level distribution directory NAMESPACE_BEGIN(Grid); // Dirac algebra adjoint operator (not in to overload other adj) -accelerator_inline Gamma adj(const Gamma &g) +inline Gamma adj(const Gamma &g) { return Gamma (Gamma::adj[g.g]); } @@ -48,7 +48,7 @@ accelerator_inline Gamma adj(const Gamma &g) // Dirac algebra multiplication operator -accelerator_inline Gamma operator*(const Gamma &g1, const Gamma &g2) +inline Gamma operator*(const Gamma &g1, const Gamma &g2) { return Gamma (Gamma::mul[g1.g][g2.g]); } diff --git a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc b/Grid/serialisation/BaseIO.cc similarity index 55% rename from Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc rename to Grid/serialisation/BaseIO.cc index f0b15e3b..9afc20b3 100644 --- a/Grid/qcd/action/fermion/instantiation/WilsonImplDF/WilsonKernelsInstantiationWilsonImplDF.cc +++ b/Grid/serialisation/BaseIO.cc @@ -2,14 +2,11 @@ Grid physics library, www.github.com/paboyle/Grid -Source file: ./lib/qcd/action/fermion/WilsonKernels.cc +Source file: ./lib/serialisation/BaseIO.h -Copyright (C) 2015, 2020 +Copyright (C) 2015 -Author: Peter Boyle -Author: Peter Boyle -Author: paboyle -Author: Nils Meyer Regensburg University +Author: Michael Marshall This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,27 +22,14 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -See the full license in the file "LICENSE" in the top level distribution -directory +See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ -#include -#include -#include -#ifndef AVX512 -#ifndef QPX -#ifndef A64FX -#ifndef A64FXFIXEDSIZE -#include -#endif -#endif -#endif -#endif +#include -NAMESPACE_BEGIN(Grid); +NAMESPACE_BEGIN(Grid) -#include "impl.h" -template class WilsonKernels; +std::uint64_t EigenIO::EigenResizeCounter(0); -NAMESPACE_END(Grid); +NAMESPACE_END(Grid) diff --git a/Grid/serialisation/BaseIO.h b/Grid/serialisation/BaseIO.h index 49406201..25481301 100644 --- a/Grid/serialisation/BaseIO.h +++ b/Grid/serialisation/BaseIO.h @@ -9,6 +9,7 @@ Author: Antonin Portelli Author: Peter Boyle Author: Guido Cossu +Author: Michael Marshall This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,6 +31,7 @@ Author: Guido Cossu #ifndef GRID_SERIALISATION_ABSTRACT_READER_H #define GRID_SERIALISATION_ABSTRACT_READER_H +#include #include #include #include @@ -110,6 +112,10 @@ namespace Grid { template inline typename std::enable_if::value, typename Traits::scalar_type *>::type getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); } + + // Counter for resized EigenTensors (poor man's substitute for allocator) + // Defined in BaseIO.cc + extern std::uint64_t EigenResizeCounter; } // Abstract writer/reader classes //////////////////////////////////////////// @@ -497,8 +503,14 @@ namespace Grid { typename std::enable_if::value, void>::type Reader::Reshape(ETensor &t, const std::array &dims ) { +#ifdef GRID_OMP + // The memory counter is the reason this must be done from the primary thread +
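// (Aside: the accounting around this resize is deliberately simple: retire the
// old allocation's bytes, resize, then add the new allocation's bytes. Because
// EigenResizeCounter is a plain non-atomic integer, the update must not race,
// hence the primary-thread assertion on the next line. A standalone sketch of
// the same pattern; gResizeBytes and track_resize are hypothetical names, not
// Grid API:

#include <cstdint>

std::uint64_t gResizeBytes = 0;  // global tally of live tensor bytes

template <typename Tensor, typename Dims>
void track_resize(Tensor &t, const Dims &dims)
{
  gResizeBytes -= t.size() * sizeof(typename Tensor::Scalar); // old footprint
  t.resize(dims);                                             // reallocate
  gResizeBytes += t.size() * sizeof(typename Tensor::Scalar); // new footprint
}

// A reader of the counter then sees the total bytes currently held by resized
// tensors, which is what the "poor man's substitute for allocator" comment means.)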
assert(omp_in_parallel()==0 && "Deserialisation which resizes Eigen tensor must happen from primary thread"); +#endif + EigenIO::EigenResizeCounter -= static_cast(t.size()) * sizeof(typename ETensor::Scalar); //t.reshape( dims ); t.resize( dims ); + EigenIO::EigenResizeCounter += static_cast(t.size()) * sizeof(typename ETensor::Scalar); } template diff --git a/Grid/serialisation/Hdf5IO.cc b/Grid/serialisation/Hdf5IO.cc index 77396809..db78df8e 100644 --- a/Grid/serialisation/Hdf5IO.cc +++ b/Grid/serialisation/Hdf5IO.cc @@ -1,8 +1,39 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./Grid/serialisation/Hdf5IO.cc + + Copyright (C) 2015 + + Author: Antonin Portelli + Author: Peter Boyle + Author: Guido Cossu + Author: Michael Marshall + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + #include using namespace Grid; #ifndef H5_NO_NAMESPACE -using namespace H5NS; +using namespace H5NS; // Compile error here? Try adding --enable-cxx to hdf5 configure #endif // Writer implementation /////////////////////////////////////////////////////// diff --git a/Grid/serialisation/Hdf5IO.h b/Grid/serialisation/Hdf5IO.h index 19537599..ae5e740b 100644 --- a/Grid/serialisation/Hdf5IO.h +++ b/Grid/serialisation/Hdf5IO.h @@ -1,3 +1,34 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./Grid/serialisation/Hdf5IO.h + + Copyright (C) 2015 + + Author: Peter Boyle + Author: Antonin Portelli + Author: Guido Cossu + Author: Michael Marshall + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + #ifndef GRID_SERIALISATION_HDF5_H #define GRID_SERIALISATION_HDF5_H @@ -9,10 +40,6 @@ #include #include "Hdf5Type.h" -#ifndef H5_NO_NAMESPACE -#define H5NS H5 -#endif - // default threshold above which datasets are used instead of attributes #ifndef HDF5_DEF_DATASET_THRES #define HDF5_DEF_DATASET_THRES 6u @@ -34,11 +61,13 @@ namespace Grid template void writeDefault(const std::string &s, const U &x); template - typename std::enable_if>::is_number, void>::type + void writeRagged(const std::string &s, const std::vector &x); + template + typename std::enable_if>::value>::type writeDefault(const std::string &s, const std::vector &x); template - typename std::enable_if>::is_number, void>::type - writeDefault(const std::string &s, const std::vector &x); + typename std::enable_if>::value>::type + writeDefault(const std::string &s, const std::vector &x) { writeRagged(s, x); } template void writeMultiDim(const std::string &s, const std::vector & Dimensions, const U * pDataRowMajor, size_t NumElements); H5NS::Group & getGroup(void); @@ -64,11 +93,13 @@ namespace Grid template void readDefault(const std::string &s, U &output); template - typename std::enable_if>::is_number, void>::type + void readRagged(const std::string &s, std::vector &x); + template + typename std::enable_if>::value>::type readDefault(const std::string &s, std::vector &x); template - typename std::enable_if>::is_number, void>::type - readDefault(const std::string &s, std::vector &x); + typename std::enable_if>::value>::type + readDefault(const std::string &s, std::vector &x) { readRagged(s, x); } template void readMultiDim(const std::string &s, std::vector &buf, std::vector &dim); H5NS::Group & getGroup(void); @@ -176,24 +207,30 @@ namespace Grid } template - typename std::enable_if>::is_number, void>::type + typename std::enable_if>::value>::type Hdf5Writer::writeDefault(const std::string &s, const std::vector &x) { - // alias to element type - typedef typename element>::type Element; - - // flatten the vector and getting dimensions - Flatten> flat(x); - std::vector dim; - const auto &flatx = flat.getFlatVector(); - for (auto &d: flat.getDim()) - dim.push_back(d); - writeMultiDim(s, dim, &flatx[0], flatx.size()); + if (isRegularShape(x)) + { + // alias to element type + using Scalar = typename is_flattenable>::type; + + // flatten the vector and get the dimensions + Flatten> flat(x); + std::vector dim; + const auto &flatx = flat.getFlatVector(); + for (auto &d: flat.getDim()) + dim.push_back(d); + writeMultiDim(s, dim, &flatx[0], flatx.size()); + } + else + { + writeRagged(s, x); + } } template - typename std::enable_if>::is_number, void>::type - Hdf5Writer::writeDefault(const std::string &s, const std::vector &x) + void Hdf5Writer::writeRagged(const std::string &s, const std::vector &x) { push(s); writeSingleAttribute(x.size(), HDF5_GRID_GUARD "vector_size", @@ -229,7 +266,7 @@ namespace Grid void Hdf5Reader::readMultiDim(const std::string &s, std::vector &buf, std::vector &dim) { // alias to element type - typedef typename element>::type Element; + using Scalar = typename is_flattenable>::type; // read the dimensions H5NS::DataSpace dataSpace; @@ -260,37 +297,44 @@ namespace Grid H5NS::DataSet dataSet; dataSet = group_.openDataSet(s); - dataSet.read(buf.data(), Hdf5Type::type()); + dataSet.read(buf.data(), Hdf5Type::type()); } else {
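// (This else-branch is the read side of the size dispatch: small objects are
// stored as HDF5 attributes and large ones as datasets, with
// HDF5_DEF_DATASET_THRES as the default cut, so the reader mirrors the
// writer's choice. Note also the shape dispatch added in this hunk:
// writeDefault() now probes nested vectors with isRegularShape() and flattens
// regular ones into a single multidimensional dataset, while ragged ones fall
// back to writeRagged(), which tags each level with the HDF5_GRID_GUARD
// "vector_size" attribute; readDefault() keys off the presence of that
// attribute to choose readRagged() or the flattened path.)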
H5NS::Attribute attribute; attribute = group_.openAttribute(s); - attribute.read(Hdf5Type::type(), buf.data()); + attribute.read(Hdf5Type::type(), buf.data()); } } template - typename std::enable_if>::is_number, void>::type + typename std::enable_if>::value>::type Hdf5Reader::readDefault(const std::string &s, std::vector &x) { - // alias to element type - typedef typename element>::type Element; + if (H5Lexists (group_.getId(), s.c_str(), H5P_DEFAULT) > 0 + && H5Aexists_by_name(group_.getId(), s.c_str(), HDF5_GRID_GUARD "vector_size", H5P_DEFAULT ) > 0) + { + readRagged(s, x); + } + else + { + // alias to element type + using Scalar = typename is_flattenable>::type; - std::vector dim; - std::vector buf; - readMultiDim( s, buf, dim ); + std::vector dim; + std::vector buf; + readMultiDim( s, buf, dim ); - // reconstruct the multidimensional vector - Reconstruct> r(buf, dim); - - x = r.getVector(); + // reconstruct the multidimensional vector + Reconstruct> r(buf, dim); + + x = r.getVector(); + } } template - typename std::enable_if>::is_number, void>::type - Hdf5Reader::readDefault(const std::string &s, std::vector &x) + void Hdf5Reader::readRagged(const std::string &s, std::vector &x) { uint64_t size; diff --git a/Grid/serialisation/Hdf5Type.h b/Grid/serialisation/Hdf5Type.h index 64dda349..d8a0dd22 100644 --- a/Grid/serialisation/Hdf5Type.h +++ b/Grid/serialisation/Hdf5Type.h @@ -5,7 +5,9 @@ #include #include -#ifndef H5_NO_NAMESPACE +#ifdef H5_NO_NAMESPACE +#define H5NS +#else #define H5NS H5 #endif diff --git a/Grid/serialisation/MacroMagic.h b/Grid/serialisation/MacroMagic.h index 0495b91e..de456305 100644 --- a/Grid/serialisation/MacroMagic.h +++ b/Grid/serialisation/MacroMagic.h @@ -118,13 +118,13 @@ static inline std::string SerialisableClassName(void) {return std::string(#cname static constexpr bool isEnum = false; \ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\ template \ -static inline void write(Writer &WR,const std::string &s, const cname &obj){ \ +static inline void write(::Grid::Writer &WR,const std::string &s, const cname &obj){ \ push(WR,s);\ GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \ pop(WR);\ }\ template \ -static inline void read(Reader &RD,const std::string &s, cname &obj){ \ +static inline void read(::Grid::Reader &RD,const std::string &s, cname &obj){ \ if (!push(RD,s))\ {\ std::cout << ::Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl; \ diff --git a/Grid/serialisation/VectorUtils.h b/Grid/serialisation/VectorUtils.h index dd5ff0b8..8f490c64 100644 --- a/Grid/serialisation/VectorUtils.h +++ b/Grid/serialisation/VectorUtils.h @@ -9,7 +9,8 @@ Author: Antonin Portelli Author: Peter Boyle Author: paboyle - + Author: Michael Marshall + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -236,21 +237,36 @@ namespace Grid { } } - // Vector element trait ////////////////////////////////////////////////////// - template - struct element + // is_flattenable::value is true if T is a std::vector<> which can be flattened ////////////////////// + template + struct is_flattenable : std::false_type { - typedef T type; - static constexpr bool is_number = false; + using type = T; + using grid_type = T; + static constexpr int vecRank = 0; + static constexpr bool isGridTensor = false; + static constexpr bool children_flattenable = std::is_arithmetic::value or 
is_complex::value; }; - + template - struct element> + struct is_flattenable::value>::type> : std::false_type { - typedef typename element::type type; - static constexpr bool is_number = std::is_arithmetic::value - or is_complex::value - or element::is_number; + using type = typename GridTypeMapper::scalar_type; + using grid_type = T; + static constexpr int vecRank = 0; + static constexpr bool isGridTensor = true; + static constexpr bool children_flattenable = true; + }; + + template + struct is_flattenable, typename std::enable_if::children_flattenable>::type> + : std::true_type + { + using type = typename is_flattenable::type; + using grid_type = typename is_flattenable::grid_type; + static constexpr bool isGridTensor = is_flattenable::isGridTensor; + static constexpr int vecRank = is_flattenable::vecRank + 1; + static constexpr bool children_flattenable = true; }; // Vector flattening utility class //////////////////////////////////////////// @@ -259,23 +275,30 @@ namespace Grid { class Flatten { public: - typedef typename element::type Element; + using Scalar = typename is_flattenable::type; + static constexpr bool isGridTensor = is_flattenable::isGridTensor; public: - explicit Flatten(const V &vector); - const V & getVector(void); - const std::vector & getFlatVector(void); - const std::vector & getDim(void); + explicit Flatten(const V &vector); + const V & getVector(void) const { return vector_; } + const std::vector & getFlatVector(void) const { return flatVector_; } + const std::vector & getDim(void) const { return dim_; } private: - void accumulate(const Element &e); - template - void accumulate(const W &v); - void accumulateDim(const Element &e); - template - void accumulateDim(const W &v); + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + accumulate(const W &e); + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + accumulate(const W &e); + template typename std::enable_if< is_flattenable::value>::type + accumulate(const W &v); + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + accumulateDim(const W &e) {} // Innermost is a scalar - do nothing + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + accumulateDim(const W &e); + template typename std::enable_if< is_flattenable::value>::type + accumulateDim(const W &v); private: - const V &vector_; - std::vector flatVector_; - std::vector dim_; + const V &vector_; + std::vector flatVector_; + std::vector dim_; }; // Class to reconstruct a multidimensional std::vector @@ -283,38 +306,57 @@ namespace Grid { class Reconstruct { public: - typedef typename element::type Element; + using Scalar = typename is_flattenable::type; + static constexpr bool isGridTensor = is_flattenable::isGridTensor; public: - Reconstruct(const std::vector &flatVector, + Reconstruct(const std::vector &flatVector, const std::vector &dim); - const V & getVector(void); - const std::vector & getFlatVector(void); - const std::vector & getDim(void); + const V & getVector(void) const { return vector_; } + const std::vector & getFlatVector(void) const { return flatVector_; } + const std::vector & getDim(void) const { return dim_; } private: - void fill(std::vector &v); - template - void fill(W &v); - void resize(std::vector &v, const unsigned int dim); - template - void resize(W &v, const unsigned int dim); + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + fill(W &v); + template typename std::enable_if::value && 
is_flattenable::isGridTensor>::type + fill(W &v); + template typename std::enable_if< is_flattenable::value>::type + fill(W &v); + template typename std::enable_if< is_flattenable::value && is_flattenable::vecRank==1>::type + resize(W &v, const unsigned int dim); + template typename std::enable_if< is_flattenable::value && (is_flattenable::vecRank>1)>::type + resize(W &v, const unsigned int dim); + template typename std::enable_if::isGridTensor>::type + checkInnermost(const W &e) {} // Innermost is a scalar - do nothing + template typename std::enable_if< is_flattenable::isGridTensor>::type + checkInnermost(const W &e); private: - V vector_; - const std::vector &flatVector_; - std::vector dim_; - size_t ind_{0}; - unsigned int dimInd_{0}; + V vector_; + const std::vector &flatVector_; + std::vector dim_; + size_t ind_{0}; + unsigned int dimInd_{0}; }; // Flatten class template implementation template - void Flatten::accumulate(const Element &e) + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + Flatten::accumulate(const W &e) { flatVector_.push_back(e); } template - template - void Flatten::accumulate(const W &v) + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + Flatten::accumulate(const W &e) + { + for (const Scalar &x: e) { + flatVector_.push_back(x); + } + } + + template + template typename std::enable_if::value>::type + Flatten::accumulate(const W &v) { for (auto &e: v) { @@ -323,11 +365,17 @@ namespace Grid { } template - void Flatten::accumulateDim(const Element &e) {}; + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + Flatten::accumulateDim(const W &e) + { + using Traits = GridTypeMapper::grid_type>; + for (int rank=0; rank < Traits::Rank; ++rank) + dim_.push_back(Traits::Dimension(rank)); + } template - template - void Flatten::accumulateDim(const W &v) + template typename std::enable_if::value>::type + Flatten::accumulateDim(const W &v) { dim_.push_back(v.size()); accumulateDim(v[0]); @@ -337,42 +385,36 @@ namespace Grid { Flatten::Flatten(const V &vector) : vector_(vector) { - accumulate(vector_); accumulateDim(vector_); - } - - template - const V & Flatten::getVector(void) - { - return vector_; - } - - template - const std::vector::Element> & - Flatten::getFlatVector(void) - { - return flatVector_; - } - - template - const std::vector & Flatten::getDim(void) - { - return dim_; + std::size_t TotalSize{ dim_[0] }; + for (int i = 1; i < dim_.size(); ++i) { + TotalSize *= dim_[i]; + } + flatVector_.reserve(TotalSize); + accumulate(vector_); } // Reconstruct class template implementation template - void Reconstruct::fill(std::vector &v) + template typename std::enable_if::value && !is_flattenable::isGridTensor>::type + Reconstruct::fill(W &v) + { + v = flatVector_[ind_++]; + } + + template + template typename std::enable_if::value && is_flattenable::isGridTensor>::type + Reconstruct::fill(W &v) { for (auto &e: v) { e = flatVector_[ind_++]; } } - + template - template - void Reconstruct::fill(W &v) + template typename std::enable_if::value>::type + Reconstruct::fill(W &v) { for (auto &e: v) { @@ -381,14 +423,15 @@ namespace Grid { } template - void Reconstruct::resize(std::vector &v, const unsigned int dim) + template typename std::enable_if::value && is_flattenable::vecRank==1>::type + Reconstruct::resize(W &v, const unsigned int dim) { v.resize(dim_[dim]); } template - template - void Reconstruct::resize(W &v, const unsigned int dim) + template typename std::enable_if::value && 
(is_flattenable::vecRank>1)>::type + Reconstruct::resize(W &v, const unsigned int dim) { v.resize(dim_[dim]); for (auto &e: v) @@ -398,34 +441,31 @@ namespace Grid { } template - Reconstruct::Reconstruct(const std::vector &flatVector, + template typename std::enable_if::isGridTensor>::type + Reconstruct::checkInnermost(const W &) + { + using Traits = GridTypeMapper::grid_type>; + const int gridRank{Traits::Rank}; + const int dimRank{static_cast(dim_.size())}; + assert(dimRank >= gridRank && "Tensor rank too low for Grid tensor"); + for (int i=0; i + Reconstruct::Reconstruct(const std::vector &flatVector, const std::vector &dim) : flatVector_(flatVector) , dim_(dim) { + checkInnermost(vector_); + assert(dim_.size() == is_flattenable::vecRank && "Tensor rank doesn't match nested std::vector rank"); resize(vector_, 0); fill(vector_); } - template - const V & Reconstruct::getVector(void) - { - return vector_; - } - - template - const std::vector::Element> & - Reconstruct::getFlatVector(void) - { - return flatVector_; - } - - template - const std::vector & Reconstruct::getDim(void) - { - return dim_; - } - // Vector IO utilities /////////////////////////////////////////////////////// // helper function to read space-separated values template @@ -459,6 +499,64 @@ namespace Grid { return os; } + + // In general, scalar types are considered "flattenable" (regularly shaped) + template + bool isRegularShapeHelper(const std::vector &, std::vector &, int, bool) + { + return true; + } + + template + bool isRegularShapeHelper(const std::vector> &v, std::vector &Dims, int Depth, bool bFirst) + { + if( bFirst) + { + assert( Dims.size() == Depth && "Bug: Delete this message after testing" ); + Dims.push_back(v[0].size()); + if (!Dims[Depth]) + return false; + } + else + { + assert( Dims.size() >= Depth + 1 && "Bug: Delete this message after testing" ); + } + for (std::size_t i = 0; i < v.size(); ++i) + { + if (v[i].size() != Dims[Depth] || !isRegularShapeHelper(v[i], Dims, Depth + 1, bFirst && i==0)) + { + return false; + } + } + return true; + } + + template + bool isRegularShape(const T &t) { return true; } + + template + bool isRegularShape(const std::vector &v) { return !v.empty(); } + + // Return non-zero if all dimensions of this std::vector> are regularly shaped + template + bool isRegularShape(const std::vector> &v) + { + if (v.empty() || v[0].empty()) + return false; + // Make sure all of my rows are the same size + std::vector Dims; + Dims.reserve(is_flattenable::vecRank); + Dims.push_back(v.size()); + Dims.push_back(v[0].size()); + for (std::size_t i = 0; i < Dims[0]; ++i) + { + if (v[i].size() != Dims[1] || !isRegularShapeHelper(v[i], Dims, 2, i==0)) + { + return false; + } + } + return true; + } } // helper function to read space-separated values diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index 2ce48369..1150b234 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -3,20 +3,48 @@ NAMESPACE_BEGIN(Grid); +template +accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type) +{ + typedef decltype(coalescedRead(mp0)) sobj; + unsigned int Nsimd = vobj::Nsimd(); + unsigned int mask = Nsimd >> (type + 1); + int lane = acceleratorSIMTlane(Nsimd); + int j0 = lane &(~mask); // inner coor zero + int j1 = lane |(mask) ; // inner coor one + const vobj *vpa = &vp0; + const vobj *vpb = &vp1; + const vobj *vp = (lane&mask) ? 
(vpb) : (vpa); + auto sa = coalescedRead(vp[0],j0); + auto sb = coalescedRead(vp[0],j1); + coalescedWrite(mp0,sa); + coalescedWrite(mp1,sb); +} + template class SimpleCompressor { public: void Point(int) {}; accelerator_inline int CommDatumSize(void) const { return sizeof(vobj); } accelerator_inline bool DecompressionStep(void) const { return false; } - template accelerator_inline void Compress(cobj *buf,int o,const cobj &in) const { buf[o]=in; } + accelerator_inline void Compress(vobj &buf,const vobj &in) const { + coalescedWrite(buf,coalescedRead(in)); + } accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o) const { +#ifdef GRID_SIMT + exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); +#else exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); +#endif } accelerator_inline void Decompress(vobj *out,vobj *in, int o) const { assert(0); } accelerator_inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, - int j,int k, int m,int type) const { + int j,int k, int m,int type) const { +#ifdef GRID_SIMT + exchangeSIMT(out0[j],out1[j],in[k],in[m],type); +#else exchange(out0[j],out1[j],in[k],in[m],type); +#endif } // For cshift. Cshift should drop compressor coupling altogether // because I had to decouple the code from the Stencil anyway diff --git a/Grid/stencil/Stencil.cc b/Grid/stencil/Stencil.cc index 5b1bb2ea..c1b33baa 100644 --- a/Grid/stencil/Stencil.cc +++ b/Grid/stencil/Stencil.cc @@ -30,7 +30,7 @@ NAMESPACE_BEGIN(Grid); void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, - int off,Vector > & table) + int off,std::vector > & table) { table.resize(0); diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 58cebed3..fb01abbb 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -57,27 +57,22 @@ NAMESPACE_BEGIN(Grid); /////////////////////////////////////////////////////////////////// void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, - int off,Vector > & table); + int off,std::vector > & table); template -void Gather_plane_simple_table (Vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); +void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline)); template -void Gather_plane_simple_table (Vector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) +void Gather_plane_simple_table (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so) { int num=table.size(); std::pair *table_v = & table[0]; auto rhs_v = rhs.View(AcceleratorRead); accelerator_forNB( i,num, vobj::Nsimd(), { - typedef decltype(coalescedRead(buffer[0])) compressed_t; - compressed_t tmp_c; - uint64_t o = table_v[i].first; - compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second)); - coalescedWrite(buffer[off+o],tmp_c); + compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); }); rhs_v.ViewClose(); -// Further optimisatoin: i) software prefetch the first element of the next table entry, prefetch the table } /////////////////////////////////////////////////////////////////// @@ -85,10 +80,10 @@ void Gather_plane_simple_table (Vector >& table,const Lattice /////////////////////////////////////////////////////////////////// template void Gather_plane_exchange_table(const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask,compressor &compress,int type) 
__attribute__((noinline)); + commVector pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline)); template -void Gather_plane_exchange_table(Vector >& table,const Lattice &rhs, +void Gather_plane_exchange_table(commVector >& table,const Lattice &rhs, Vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type) { @@ -100,7 +95,7 @@ void Gather_plane_exchange_table(Vector >& table,const Lattic auto p0=&pointers[0][0]; auto p1=&pointers[1][0]; auto tp=&table[0]; - accelerator_forNB(j, num, 1, { + accelerator_forNB(j, num, vobj::Nsimd(), { compress.CompressExchange(p0,p1, &rhs_v[0], j, so+tp[2*j ].second, so+tp[2*j+1].second, @@ -266,10 +261,11 @@ public: } int face_table_computed; - std::vector > > face_table ; + std::vector > > face_table ; Vector surface_list; stencilVector _entries; // Resident in managed memory + commVector _entries_device; // Resident in managed memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; @@ -330,21 +326,8 @@ public: int xmit_to_rank; if ( ! comm_dim ) return 1; - - int nbr_proc; - if (displacement>0) nbr_proc = 1; - else nbr_proc = pd-1; - - // FIXME this logic needs to be sorted for three link term - // assert( (displacement==1) || (displacement==-1)); - // Present hack only works for >= 4^4 subvol per node - _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); - - if ( shm==NULL ) return 0; - - return 1; + if ( displacement == 0 ) return 1; + return 0; } ////////////////////////////////////////// @@ -609,13 +592,14 @@ public: template void CommsMerge(decompressor decompress,std::vector &mm,std::vector &dd) { + mergetime-=usecond(); for(int i=0;iCheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - int shm_receive_only = 1; for(int x=0;x_ostride[dimension]; // base offset for start of plane if ( !face_table_computed ) { face_table.resize(face_idx+1); - Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); + std::vector > face_table_host ; + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host); + face_table[face_idx].resize(face_table_host.size()); + acceleratorCopyToDevice(&face_table_host[0], + &face_table[face_idx][0], + face_table[face_idx].size()*sizeof(face_table_host[0])); } // int rank = _grid->_processor; @@ -1050,10 +1038,6 @@ public: assert (xmit_to_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank()); - ///////////////////////////////////////////////////////// - // try the direct copy if possible - ///////////////////////////////////////////////////////// - cobj *send_buf; cobj *recv_buf; if ( compress.DecompressionStep() ) { recv_buf=u_simd_recv_buf[0]; @@ -1061,52 +1045,36 @@ public: recv_buf=this->u_recv_buf_p; } - send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); - if ( send_buf==NULL ) { - send_buf = this->u_send_buf_p; - } - - // Find out if we get the direct copy. 
- void *success = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_send_buf_p); - if (success==NULL) { - // we found a packet that comes from MPI and contributes to this leg of stencil - shm_receive_only = 0; - } + cobj *send_buf; + send_buf = this->u_send_buf_p; // Gather locally, must send + //////////////////////////////////////////////////////// + // Gather locally + //////////////////////////////////////////////////////// gathertime-=usecond(); assert(send_buf!=NULL); - Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; + Gather_plane_simple_table(face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++; gathertime+=usecond(); + /////////////////////////////////////////////////////////// + // Build a list of things to do after we synchronise GPUs + // Start comms now??? + /////////////////////////////////////////////////////////// + AddPacket((void *)&send_buf[u_comm_offset], + (void *)&recv_buf[u_comm_offset], + xmit_to_rank, + recv_from_rank, + bytes); + if ( compress.DecompressionStep() ) { - - if ( shm_receive_only ) { // Early decompress before MPI is finished is possible - AddDecompress(&this->u_recv_buf_p[u_comm_offset], - &recv_buf[u_comm_offset], - words,DecompressionsSHM); - } else { // Decompress after MPI is finished - AddDecompress(&this->u_recv_buf_p[u_comm_offset], - &recv_buf[u_comm_offset], - words,Decompressions); - } - - AddPacket((void *)&send_buf[u_comm_offset], - (void *)&recv_buf[u_comm_offset], - xmit_to_rank, - recv_from_rank, - bytes); - - } else { - AddPacket((void *)&send_buf[u_comm_offset], - (void *)&this->u_recv_buf_p[u_comm_offset], - xmit_to_rank, - recv_from_rank, - bytes); + AddDecompress(&this->u_recv_buf_p[u_comm_offset], + &recv_buf[u_comm_offset], + words,Decompressions); } u_comm_offset+=words; } } - return shm_receive_only; + return 0; } template @@ -1157,7 +1125,6 @@ public: int sshift= _grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); // loop over outer coord planes orthog to dim - int shm_receive_only = 1; for(int x=0;x= rd ); @@ -1172,11 +1139,18 @@ public: if ( !face_table_computed ) { face_table.resize(face_idx+1); - Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table[face_idx]); + std::vector > face_table_host ; + + Gather_plane_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,face_table_host); + face_table[face_idx].resize(face_table_host.size()); + acceleratorCopyToDevice(&face_table_host[0], + &face_table[face_idx][0], + face_table[face_idx].size()*sizeof(face_table_host[0])); } gathermtime-=usecond(); - Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); face_idx++; + Gather_plane_exchange_table(face_table[face_idx],rhs,spointers,dimension,sx,cbmask,compress,permute_type); + face_idx++; gathermtime+=usecond(); //spointers[0] -- low @@ -1205,20 +1179,7 @@ public: _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - // shm == receive pointer if offnode - // shm == Translate[send pointer] if on node -- my view of his send pointer - cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp); - if (shm==NULL) { - shm = rp; - // we found a packet that comes from MPI and contributes to this shift. - // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil. - // Kernel will add the exterior_terms except if is_same_node. 
- shm_receive_only = 0; - // leg of stencil - } - // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node - // assuming above pointer flip - rpointers[i] = shm; + rpointers[i] = rp; AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); @@ -1230,102 +1191,17 @@ public: } } - if ( shm_receive_only ) { - AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,MergersSHM); - } else { - AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); - } + AddMerge(&this->u_recv_buf_p[u_comm_offset],rpointers,reduced_buffer_size,permute_type,Mergers); u_comm_offset +=buffer_size; } } - return shm_receive_only; + return 0; } - void ZeroCounters(void) { - gathertime = 0.; - commtime = 0.; - mpi3synctime=0.; - mpi3synctime_g=0.; - shmmergetime=0.; - for(int i=0;i_npoints;i++){ - comm_time_thr[i]=0; - comm_bytes_thr[i]=0; - comm_enter_thr[i]=0; - comm_leave_thr[i]=0; - shm_bytes_thr[i]=0; - } - halogtime = 0.; - mergetime = 0.; - decompresstime = 0.; - gathermtime = 0.; - splicetime = 0.; - nosplicetime = 0.; - comms_bytes = 0.; - shm_bytes = 0.; - calls = 0.; - }; + void ZeroCounters(void) { }; - void Report(void) { -#define AVERAGE(A) -#define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<_Nprocessors; - RealD NN = _grid->NodeCount(); - double t = 0; - // if comm_time_thr is set they were all done in parallel so take the max - // but add up the bytes - int threaded = 0 ; - for (int i = 0; i < 8; ++i) { - if ( comm_time_thr[i]>0.0 ) { - threaded = 1; - comms_bytes += comm_bytes_thr[i]; - shm_bytes += shm_bytes_thr[i]; - if (t < comm_time_thr[i]) t = comm_time_thr[i]; - } - } - if (threaded) commtime += t; - - _grid->GlobalSum(commtime); commtime/=NP; - if ( calls > 0. ) { - std::cout << GridLogMessage << " Stencil calls "<1.0){ - PRINTIT(comms_bytes); - PRINTIT(commtime); - std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<1.0){ - PRINTIT(shm_bytes); // X bytes + R bytes - // Double this to include spin projection overhead with 2:1 ratio in wilson - auto gatheralltime = gathertime+gathermtime; - std::cout << GridLogMessage << " Stencil SHM " << (shm_bytes)/gatheralltime/1000. 
<< " GB/s per rank"< local {nt,1,nsimd}; \ cl::sycl::range<3> global{unum1,unum2,nsimd}; \ - cgh.parallel_for( \ + cgh.parallel_for( \ cl::sycl::nd_range<3>(global,local), \ [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ [[intel::reqd_sub_group_size(8)]] \ @@ -289,7 +294,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { + theGridAccelerator->memcpy(to,from,bytes); +} +inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); } inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} @@ -394,7 +402,8 @@ inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorCopySynchronise(void) { } inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} #endif @@ -435,7 +444,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} -inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} +inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} +inline void acceleratorCopySynchronise(void) {}; inline int acceleratorIsCommunicable(void *ptr){ return 1; } inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index ab2d2399..697f3ac1 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -301,6 +301,13 @@ void Grid_init(int *argc,char ***argv) GlobalSharedMemory::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL; } + if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-mpi") ){ + int forcempi; + arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm-mpi"); + GridCmdOptionInt(arg,forcempi); + Stencil_force_mpi = (bool)forcempi; + } + if( GridCmdOptionExists(*argv,*argv+*argc,"--device-mem") ){ int MB; 
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--device-mem"); @@ -419,7 +426,9 @@ void Grid_init(int *argc,char ***argv) std::cout<Barrier(); - DwH.ZeroCounters(); - DwH.Dhop(src,result,0); - double t0=usecond(); - for(int i=0;iBarrier(); - - double volume=Ls; for(int mu=0;muBarrier(); diff --git a/benchmarks/Benchmark_gparity.cc b/benchmarks/Benchmark_gparity.cc index 2045d650..ce84ecbc 100644 --- a/benchmarks/Benchmark_gparity.cc +++ b/benchmarks/Benchmark_gparity.cc @@ -2,7 +2,6 @@ #include using namespace std; using namespace Grid; - ; template struct scal { @@ -118,30 +117,6 @@ int main (int argc, char ** argv) Dw.Report(); } - std::cout << GridLogMessage<< "* SINGLE/HALF"<Barrier(); - DwH.ZeroCounters(); - DwH.Dhop(src,result,0); - double t0=usecond(); - for(int i=0;iBarrier(); - - double volume=Ls; for(int mu=0;mu struct is_tensor : std::integral_constant, T>::value> {}; + // Is this an Eigen tensor of a supported scalar + template struct is_tensor_of_scalar : public std::false_type {}; + template struct is_tensor_of_scalar::value && is_scalar::value>::type> : public std::true_type {}; + // Is this an Eigen tensor of a supported container + template struct is_tensor_of_container : public std::false_type {}; + template struct is_tensor_of_container::value && isGridTensor::value>::type> : public std::true_type {};
+
+
+Eigen tensors are regular, multidimensional objects, and each Reader/Writer
+was extended to support this new datatype. Where the Eigen tensor contains
+a Grid tensor, the dimensions of the data written are the dimensions of the
+Eigen tensor plus the dimensions of the underlying Grid scalar. Dimensions
+of size 1 are preserved.
+
+**New Reader/Writer methods for multi-dimensional data**::
+
+  template <typename U>
+  void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
+  template <typename U>
+  void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
+
+
+On readback, the Eigen tensor rank must match the data being read, but the tensor
+dimensions will be resized if necessary. Resizing is not possible for Eigen::TensorMap
+because these tensors use a buffer provided at construction, and this buffer cannot be changed.
+Deserialisation failures cause Grid to assert.
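+
+A minimal usage sketch of these methods (the file name, dataset name and
+values are illustrative only, not part of the API)::
+
+  std::vector<size_t> dims = {2, 2};                 // row-major 2 x 2 block
+  std::vector<double> block = {1.0, 2.0, 3.0, 4.0};
+  {
+    Hdf5Writer writer("example.h5");
+    writer.writeMultiDim("block", dims, &block[0], block.size());
+  }
+  Hdf5Reader reader("example.h5");
+  std::vector<double> buf;
+  std::vector<size_t> dim;
+  reader.readMultiDim("block", buf, dim);            // dim == {2, 2}, buf == block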
+
+
+HDF5 Optimisations -- added June 2021
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Grid serialisation is intended to be light and deterministic, and to provide a layer of abstraction over
+multiple file formats. HDF5 excels at handling multi-dimensional data, and the Grid Hdf5Reader/Hdf5Writer exploit this.
+When serialising nested ``std::vector<T>``, where ``T`` is an arithmetic or complex type,
+the Hdf5Writer writes the data as an HDF5 DataSet object.
+
+However, nested ``std::vector<std::vector<T>>`` might be "ragged", i.e. not necessarily regular. E.g. a 3d nested
+``std::vector`` might contain 2 rows, the first being a 2x2 block and the second row being a 1x2 block.
+A bug existed whereby this was not checked on write, so nested, ragged vectors
+were written as a regular dataset, with a buffer under/overrun and jumbled contents.
+
+Clearly this was not used in production, as the bug went undetected until now. Fixing this bug
+is an opportunity to further optimise the HDF5 file format.
+
+The goals of this change are to:
+
+* Make changes to the Hdf5 file format only -- i.e. do not impact other file formats
+
+* Implement file format changes in such a way that they are transparent to the Grid reader
+
+* Correct the bug for ragged vectors of numeric / complex types
+
+* Extend the support of nested std::vector to arbitrarily nested Grid tensors
+
+
+The trait class ``element`` has been redefined to ``is_flattenable``, which is a trait class for
+potentially "flattenable" objects. These are (possibly nested) ``std::vector<T>`` where ``T`` is
+an arithmetic, complex or Grid tensor type. Flattenable objects are tested on write
+(with the function ``isRegularShape``) to see whether they actually are regular.
+
+Flattenable, regular objects are written to a multidimensional HDF5 DataSet.
+Otherwise, an HDF5 sub group is created with the object "name", and each element of the outer dimension is
+recursively written as object "name_n", where n is a 0-indexed number.
+
+On readback (by Grid), the presence of a subgroup containing the attribute ``Grid_vector_size`` triggers a
+"ragged read", otherwise a read from a DataSet is attempted.
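+
+As a sketch (values illustrative only): both vectors below are flattenable, but
+only the first passes the run-time ``isRegularShape`` test and is written as a
+single 2 x 3 DataSet; the second falls back to the ragged representation
+described above::
+
+  std::vector<std::vector<double>> regular = {{1, 2, 3}, {4, 5, 6}};
+  std::vector<std::vector<double>> ragged  = {{1, 2}, {3}};  // row lengths differ
+
+  assert(  isRegularShape(regular) );  // one multidimensional DataSet
+  assert( !isRegularShape(ragged)  );  // sub group "name_0", "name_1", ...,
+                                       // marked with the Grid_vector_size attribute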
+
+
 Data parallel field IO
 -----------------------
diff --git a/examples/Example_wall_wall_spectrum.cc b/examples/Example_wall_wall_spectrum.cc new file mode 100644 index 00000000..0d70f351 --- /dev/null +++ b/examples/Example_wall_wall_spectrum.cc @@ -0,0 +1,404 @@ +/* + * Warning: This code is illustrative only: not well tested, and not meant for production use + * without regression / tests being applied + */ + +#include + +using namespace std; +using namespace Grid; +typedef SpinColourMatrix Propagator; +typedef SpinColourVector Fermion; + +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficient + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented; needed only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented; needed only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented; needed only for multigrid +}; + +void MakePhase(Coordinate mom,LatticeComplex &phase) +{ + GridBase *grid = phase.Grid(); + auto latt_size = grid->GlobalDimensions(); + ComplexD ci(0.0,1.0); + phase=Zero(); + + LatticeComplex coor(phase.Grid()); + for(int mu=0;mu::avgPlaquette(U); + + std::cout << " Initial plaquette "<::SteepestDescentGaugeFix(Ufix,xform,alpha,10000,1.0e-12, 1.0e-12,true,orthog); + + plaq=WilsonLoops::avgPlaquette(Ufix); + + std::cout << " Final plaquette "< +void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) +{ + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + Integer Iterations = 40; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + Field tmp(U.Grid()); + smeared=unsmeared; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(smeared,tmp); + smeared = smeared - coeff*tmp; + std::cout << " smear iter " << n<<" " < +void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = D.GaugeGrid(); + GridBase *FGrid = D.FermionGrid(); + + LatticeFermion src4 (UGrid); + LatticeFermion src5 (FGrid); + LatticeFermion result5(FGrid); + LatticeFermion result4(UGrid); + + ConjugateGradient CG(1.0e-8,100000); + SchurRedBlackDiagMooeeSolve schur(CG); + ZeroGuesser ZG; // Could be a DeflatedGuesser if we have eigenvectors + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + + result5=Zero(); + schur(D,src5,result5,ZG); + std::cout<(propagator,result4,s,c); + } + } +} + +class MesonFile: Serializable { +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector >, data); +}; + +void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) +{ + const int nchannel=4; + Gamma::Algebra Gammas[nchannel][2] = { + {Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5}, + {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5}, + {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5}, + {Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5} + }; + + Gamma G5(Gamma::Algebra::Gamma5); + + LatticeComplex meson_CF(q1.Grid()); + MesonFile MF; + + for(int ch=0;ch meson_T; + sliceSum(meson_CF,meson_T, Tdir); + + int nt=meson_T.size(); + + std::vector corr(nt); + for(int t=0;t &q1,std::vector &q2) +{ + const int nchannel=4; + Gamma::Algebra Gammas[nchannel][2] = { + {Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5}, + {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5}, + {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5}, + {Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5} + }; + + Gamma G5(Gamma::Algebra::Gamma5); + int nt=q1.size(); + std::vector meson_CF(nt); + MesonFile MF; + + for(int ch=0;ch corr(nt); + for(int t=0;t seeds4({1,2,3,4}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + LatticeGaugeField Ufixed(UGrid); + std::string config; + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); + config="HotConfig"; + } + GaugeFix(Umu,Ufixed); + Umu=Ufixed; + + + std::vector masses({ 0.004,0.02477,0.447} ); // u/d, s, c ??
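+  // Per-quark-mass Mobius domain-wall parameters for the three masses above:
+  // domain-wall height M5, Mobius coefficients (b,c) and fifth-dimension
+  // extent Ls; one fermion action is constructed per mass below.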
+ std::vector M5s ({ 1.8,1.8,1.0} ); + std::vector bs ({ 1.0,1.0,1.5} ); // DDM + std::vector cs ({ 0.0,0.0,0.5} ); // DDM + std::vector Ls_s ({ 16,16,12} ); + std::vector FGrids; + std::vector FrbGrids; + + int nmass = masses.size(); + + std::vector FermActs; + + std::cout< PointProps(nmass,UGrid); + std::vector GaussProps(nmass,UGrid); + std::vector Z2Props (nmass,UGrid); + std::vector GFProps (nmass,UGrid); + + for(int m=0;m > wsnk_z2Props(nmass); + std::vector > wsnk_gfProps(nmass); + for(int m=0;m develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34004218675 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 0.729967 s : Grid is setup to use 4 threads +Grid : Message : 0.729975 s : Number of iterations to average: 250 +Grid : Message : 0.729977 s : ==================================================================================================== +Grid : Message : 0.729978 s : = Benchmarking sequential halo exchange from host memory +Grid : Message : 0.729979 s : ==================================================================================================== +Grid : Message : 0.729980 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max) +Grid : Message : 0.749870 s : 8 8 393216 50783.4 101566.8 +Grid : Message : 0.764282 s : 8 8 393216 54704.5 109409.0 +Grid : Message : 0.780310 s : 8 8 393216 49090.6 98181.3 +Grid : Message : 0.796479 s : 8 8 393216 48662.3 97324.7 +Grid : Message : 0.841551 s : 12 8 1327104 66728.9 133457.8 +Grid : Message : 0.880653 s : 12 8 1327104 67932.9 135865.9 +Grid : Message : 0.920097 s : 12 8 1327104 67304.2 134608.4 +Grid : Message : 0.961444 s : 12 8 1327104 64205.9 128411.8 +Grid : Message : 1.660890 s : 16 8 3145728 67833.1 135666.3 +Grid : Message : 1.153006 s : 16 8 3145728 72416.3 144832.6 +Grid : Message : 1.240962 s : 16 8 3145728 71536.1 143072.2 +Grid : Message : 1.330372 s : 16 8 3145728 70372.7 140745.3 +Grid : Message : 1.519996 s : 20 8 6144000 71017.4 142034.8 +Grid : Message : 1.667745 s : 20 8 6144000 83189.5 166378.9 +Grid : Message : 1.817908 s : 20 8 6144000 81836.5 163673.1 +Grid : Message : 1.969344 s : 20 8 6144000 81148.0 162296.0 +Grid : Message : 2.260249 s : 24 8 10616832 79299.9 158599.8 +Grid : Message : 2.512319 s : 24 8 10616832 84249.2 168498.4 +Grid : Message : 2.763820 s : 24 8 10616832 84430.4 168860.9 +Grid : Message : 3.172850 s : 24 8 10616832 83776.5 167553.1 +Grid : Message : 3.460951 s : 28 8 16859136 82176.6 164353.1 +Grid : Message : 3.859348 s : 28 8 16859136 84642.9 169285.9 +Grid : Message : 4.254351 s : 28 8 16859136 85366.0 170731.9 +Grid : Message : 4.651748 s : 28 8 16859136 84850.2 169700.4 +Grid : Message : 5.302166 s : 32 8 25165824 83402.1 166804.1 +Grid : Message : 5.889123 s : 32 8 25165824 85756.3 171512.6 +Grid : Message : 6.472357 s : 32 8 25165824 86299.1 172598.3 +Grid : Message : 7.572140 s : 32 8 25165824 86059.7 172119.3 +Grid : Message : 7.578700 s : 
==================================================================================================== +Grid : Message : 7.578740 s : = Benchmarking sequential halo exchange from GPU memory +Grid : Message : 7.578750 s : ==================================================================================================== +Grid : Message : 7.578760 s : L Ls bytes MB/s uni (err/min/max) MB/s bidi (err/min/max) +Grid : Message : 7.119231 s : 8 8 393216 13844.9 27689.8 +Grid : Message : 7.150661 s : 8 8 393216 25034.4 50068.9 +Grid : Message : 7.173800 s : 8 8 393216 34002.0 68004.0 +Grid : Message : 7.197415 s : 8 8 393216 33317.7 66635.5 +Grid : Message : 7.240696 s : 12 8 1327104 110772.0 221544.0 +Grid : Message : 7.263466 s : 12 8 1327104 116627.5 233254.9 +Grid : Message : 7.310752 s : 12 8 1327104 56142.8 112285.6 +Grid : Message : 7.356881 s : 12 8 1327104 57551.3 115102.6 +Grid : Message : 7.422351 s : 16 8 3145728 167086.0 334172.0 +Grid : Message : 7.458334 s : 16 8 3145728 174903.6 349807.1 +Grid : Message : 7.558746 s : 16 8 3145728 62663.3 125326.6 +Grid : Message : 7.658824 s : 16 8 3145728 62871.8 125743.6 +Grid : Message : 7.741423 s : 20 8 6144000 231840.3 463680.6 +Grid : Message : 7.794862 s : 20 8 6144000 229996.1 459992.1 +Grid : Message : 7.982472 s : 20 8 6144000 65501.1 131002.1 +Grid : Message : 8.170548 s : 20 8 6144000 65338.8 130677.5 +Grid : Message : 8.277182 s : 24 8 10616832 274319.0 548638.0 +Grid : Message : 8.354585 s : 24 8 10616832 274365.1 548730.2 +Grid : Message : 8.675675 s : 24 8 10616832 66132.8 132265.7 +Grid : Message : 8.999237 s : 24 8 10616832 65627.4 131254.7 +Grid : Message : 9.140302 s : 28 8 16859136 300825.0 601650.0 +Grid : Message : 9.251320 s : 28 8 16859136 303749.1 607498.1 +Grid : Message : 9.632241 s : 28 8 16859136 88520.3 177040.6 +Grid : Message : 9.999663 s : 28 8 16859136 91772.9 183545.7 +Grid : Message : 10.183071 s : 32 8 25165824 328325.5 656651.1 +Grid : Message : 10.335093 s : 32 8 25165824 331109.7 662219.3 +Grid : Message : 10.875980 s : 32 8 25165824 93056.0 186111.9 +Grid : Message : 11.418666 s : 32 8 25165824 92747.5 185495.0 +Grid : Message : 11.434792 s : ==================================================================================================== +Grid : Message : 11.434797 s : = All done; Bye Bye +Grid : Message : 11.434798 s : ==================================================================================================== diff --git a/systems/Booster/config-command b/systems/Booster/config-command new file mode 100644 index 00000000..8530c5f9 --- /dev/null +++ b/systems/Booster/config-command @@ -0,0 +1,14 @@ +LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/ +../../configure \ + --enable-comms=mpi \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-shm=nvlink \ + --enable-accelerator=cuda \ + --with-lime=$LIME \ + --disable-accelerator-cshift \ + --disable-unified \ + CXX=nvcc \ + LDFLAGS="-cudart shared " \ + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" + diff --git a/systems/Booster/dwf.16node.perf b/systems/Booster/dwf.16node.perf new file mode 100644 index 00000000..8dcdcaed --- /dev/null +++ b/systems/Booster/dwf.16node.perf @@ -0,0 +1,156 @@ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: 
managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 64 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14ac40000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+Current Grid git commit hash=f660dc67e4b193afc4015bc5e5fe47cfdbb0356e: (HEAD -> develop, origin/develop, origin/HEAD) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34004218675 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 0.910318 s : Grid Layout +Grid : Message : 0.910320 s : Global lattice size : 64 64 64 256 +Grid : Message : 0.910325 s : OpenMP threads : 4 +Grid : Message : 0.910326 s : MPI tasks : 2 2 2 8 +Grid : Message : 0.973956 s : Making s innermost grids +Grid : Message : 1.198830 s : Initialising 4d RNG +Grid : Message : 1.119813 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 1.119870 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 2.683307 s : Initialising 5d RNG +Grid : Message : 4.220535 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 4.220563 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 37.198140 s : Initialised RNGs +Grid : Message : 39.952612 s : Drawing gauge field +Grid : Message : 40.488019 s : Random gauge initialised +Grid : Message : 42.659220 s : Setting up Cshift based reference +Grid : Message : 47.622210 s : ***************************************************************** +Grid : Message : 47.622236 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 47.622237 s : ***************************************************************** +Grid : Message : 47.622238 s : ***************************************************************** +Grid : Message : 47.622239 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 47.622240 s : * Vectorising space-time by 8 +Grid : Message : 47.622241 s : * VComplexF size is 64 B +Grid : Message : 47.622242 s : * SINGLE precision +Grid : Message : 47.622243 s : * Using Overlapped Comms/Compute +Grid : Message : 47.622244 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 47.622245 s : ***************************************************************** +Grid : Message : 48.950210 s : Called warmup +Grid : Message : 77.311124 s : Called Dw 3000 times in 2.83592e+07 us +Grid : Message : 77.311181 s : mflop/s = 1.49934e+08 +Grid : Message : 77.311184 s : mflop/s per rank = 2.34273e+06 +Grid : Message : 77.311185 s : mflop/s per node = 9.37091e+06 +Grid : Message : 77.311186 s : RF GiB/s (base 2) = 304663 +Grid : Message : 77.311187 s : mem GiB/s (base 2) = 190415 +Grid : Message : 77.314752 s : norm diff 1.03478e-13 +Grid : Message : 77.349587 s : #### Dhop calls report +Grid : Message : 77.349591 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 77.349613 s : WilsonFermion5D TotalTime /Calls : 4761.53 us +Grid : Message : 77.349615 s : WilsonFermion5D CommTime /Calls : 3363.09 us +Grid : Message : 77.349616 s : WilsonFermion5D FaceTime /Calls : 469.094 us +Grid : Message : 77.349617 s : WilsonFermion5D ComputeTime1/Calls : 26.8794 us +Grid : Message : 77.349618 s : 
WilsonFermion5D ComputeTime2/Calls : 949.276 us +Grid : Message : 77.349702 s : Average mflops/s per call : 2.68569e+10 +Grid : Message : 77.349710 s : Average mflops/s per call per rank : 4.1964e+08 +Grid : Message : 77.349711 s : Average mflops/s per call per node : 1.67856e+09 +Grid : Message : 77.349712 s : Average mflops/s per call (full) : 1.51538e+08 +Grid : Message : 77.349713 s : Average mflops/s per call per rank (full): 2.36779e+06 +Grid : Message : 77.349714 s : Average mflops/s per call per node (full): 9.47115e+06 +Grid : Message : 77.349715 s : WilsonFermion5D Stencil +Grid : Message : 77.349716 s : WilsonFermion5D StencilEven +Grid : Message : 77.349717 s : WilsonFermion5D StencilOdd +Grid : Message : 77.349718 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 77.349719 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 77.349720 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 104.883719 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 104.883743 s : Called DwDag +Grid : Message : 104.883744 s : norm dag result 12.0421 +Grid : Message : 104.901901 s : norm dag ref 12.0421 +Grid : Message : 104.917822 s : norm dag diff 7.63254e-14 +Grid : Message : 104.957229 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 105.334551 s : src_e0.499998 +Grid : Message : 105.416616 s : src_o0.500002 +Grid : Message : 105.486729 s : ********************************************************* +Grid : Message : 105.486732 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 105.486733 s : * Vectorising space-time by 8 +Grid : Message : 105.486734 s : * SINGLE precision +Grid : Message : 105.486739 s : * Using Overlapped Comms/Compute +Grid : Message : 105.486740 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 105.486741 s : ********************************************************* +Grid : Message : 119.695464 s : Deo mflop/s = 1.5039e+08 +Grid : Message : 119.695494 s : Deo mflop/s per rank 2.34984e+06 +Grid : Message : 119.695496 s : Deo mflop/s per node 9.39937e+06 +Grid : Message : 119.695502 s : #### Dhop calls report +Grid : Message : 119.695503 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 119.695505 s : WilsonFermion5D TotalTime /Calls : 4734.45 us +Grid : Message : 119.695507 s : WilsonFermion5D CommTime /Calls : 3287.23 us +Grid : Message : 119.695508 s : WilsonFermion5D FaceTime /Calls : 537.724 us +Grid : Message : 119.695509 s : WilsonFermion5D ComputeTime1/Calls : 16.0483 us +Grid : Message : 119.695510 s : WilsonFermion5D ComputeTime2/Calls : 939.854 us +Grid : Message : 119.695533 s : Average mflops/s per call : 4.50726e+10 +Grid : Message : 119.695535 s : Average mflops/s per call per rank : 7.04259e+08 +Grid : Message : 119.695536 s : Average mflops/s per call per node : 2.81703e+09 +Grid : Message : 119.695537 s : Average mflops/s per call (full) : 1.52405e+08 +Grid : Message : 119.695538 s : Average mflops/s per call per rank (full): 2.38133e+06 +Grid : Message : 119.695539 s : Average mflops/s per call per node (full): 9.52532e+06 +Grid : Message : 119.695540 s : WilsonFermion5D Stencil +Grid : Message : 119.695541 s : WilsonFermion5D StencilEven +Grid : Message : 119.695542 s : WilsonFermion5D StencilOdd +Grid : Message : 119.695543 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 119.695544 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 119.695545 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 119.752707 s : 
r_e6.02108 +Grid : Message : 119.759448 s : r_o6.02101 +Grid : Message : 119.765382 s : res12.0421 +Grid : Message : 120.419093 s : norm diff 0 +Grid : Message : 120.829772 s : norm diff even 0 +Grid : Message : 120.909078 s : norm diff odd 0 diff --git a/systems/Booster/dwf.4node.perf b/systems/Booster/dwf.4node.perf new file mode 100644 index 00000000..e897fe13 --- /dev/null +++ b/systems/Booster/dwf.4node.perf @@ -0,0 +1,156 @@ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 16 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x14e9c0000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+Current Grid git commit hash=e188c0512ebee79bfb15906676af1c9e142aa21a: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34004218675 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 0.717713 s : Grid Layout +Grid : Message : 0.717716 s : Global lattice size : 64 64 64 64 +Grid : Message : 0.717724 s : OpenMP threads : 4 +Grid : Message : 0.717725 s : MPI tasks : 2 2 2 2 +Grid : Message : 0.801634 s : Making s innermost grids +Grid : Message : 0.844903 s : Initialising 4d RNG +Grid : Message : 0.940001 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 0.940060 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 1.338368 s : Initialising 5d RNG +Grid : Message : 2.859273 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 2.859304 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 11.140924 s : Initialised RNGs +Grid : Message : 13.433456 s : Drawing gauge field +Grid : Message : 13.955847 s : Random gauge initialised +Grid : Message : 15.528535 s : Setting up Cshift based reference +Grid : Message : 21.484340 s : ***************************************************************** +Grid : Message : 21.484840 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 21.484860 s : ***************************************************************** +Grid : Message : 21.484870 s : ***************************************************************** +Grid : Message : 21.484880 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 21.484890 s : * Vectorising space-time by 8 +Grid : Message : 21.484900 s : * VComplexF size is 64 B +Grid : Message : 21.484910 s : * SINGLE precision +Grid : Message : 21.484920 s : * Using Overlapped Comms/Compute +Grid : Message : 21.484930 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 21.484940 s : ***************************************************************** +Grid : Message : 22.344741 s : Called warmup +Grid : Message : 49.832292 s : Called Dw 3000 times in 2.74873e+07 us +Grid : Message : 49.832358 s : mflop/s = 3.86726e+07 +Grid : Message : 49.832360 s : mflop/s per rank = 2.41704e+06 +Grid : Message : 49.832361 s : mflop/s per node = 9.66814e+06 +Grid : Message : 49.832362 s : RF GiB/s (base 2) = 78581.7 +Grid : Message : 49.832363 s : mem GiB/s (base 2) = 49113.6 +Grid : Message : 49.835924 s : norm diff 1.03481e-13 +Grid : Message : 49.870568 s : #### Dhop calls report +Grid : Message : 49.870574 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 49.870598 s : WilsonFermion5D TotalTime /Calls : 4616.79 us +Grid : Message : 49.870600 s : WilsonFermion5D CommTime /Calls : 3241.77 us +Grid : Message : 49.870601 s : WilsonFermion5D FaceTime /Calls : 469.006 us +Grid : Message : 49.870602 s : WilsonFermion5D ComputeTime1/Calls : 27.0492 us +Grid : Message : 49.870603 s : WilsonFermion5D ComputeTime2/Calls : 
926.33 us +Grid : Message : 49.870614 s : Average mflops/s per call : 6.71631e+09 +Grid : Message : 49.870619 s : Average mflops/s per call per rank : 4.19769e+08 +Grid : Message : 49.870621 s : Average mflops/s per call per node : 1.67908e+09 +Grid : Message : 49.870626 s : Average mflops/s per call (full) : 3.90723e+07 +Grid : Message : 49.870627 s : Average mflops/s per call per rank (full): 2.44202e+06 +Grid : Message : 49.870628 s : Average mflops/s per call per node (full): 9.76808e+06 +Grid : Message : 49.870629 s : WilsonFermion5D Stencil +Grid : Message : 49.870630 s : WilsonFermion5D StencilEven +Grid : Message : 49.870631 s : WilsonFermion5D StencilOdd +Grid : Message : 49.870632 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 49.870633 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 49.870634 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 77.321890 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 77.321911 s : Called DwDag +Grid : Message : 77.321912 s : norm dag result 12.0421 +Grid : Message : 77.334619 s : norm dag ref 12.0421 +Grid : Message : 77.350515 s : norm dag diff 7.63236e-14 +Grid : Message : 77.389923 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 77.769815 s : src_e0.499997 +Grid : Message : 77.847560 s : src_o0.500003 +Grid : Message : 77.917493 s : ********************************************************* +Grid : Message : 77.917496 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 77.917497 s : * Vectorising space-time by 8 +Grid : Message : 77.917498 s : * SINGLE precision +Grid : Message : 77.917499 s : * Using Overlapped Comms/Compute +Grid : Message : 77.917500 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 77.917501 s : ********************************************************* +Grid : Message : 91.412946 s : Deo mflop/s = 3.95925e+07 +Grid : Message : 91.412978 s : Deo mflop/s per rank 2.47453e+06 +Grid : Message : 91.412980 s : Deo mflop/s per node 9.89813e+06 +Grid : Message : 91.412983 s : #### Dhop calls report +Grid : Message : 91.412984 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 91.412986 s : WilsonFermion5D TotalTime /Calls : 4496.84 us +Grid : Message : 91.412988 s : WilsonFermion5D CommTime /Calls : 3057.28 us +Grid : Message : 91.412989 s : WilsonFermion5D FaceTime /Calls : 528.499 us +Grid : Message : 91.412990 s : WilsonFermion5D ComputeTime1/Calls : 16.1939 us +Grid : Message : 91.412991 s : WilsonFermion5D ComputeTime2/Calls : 942.557 us +Grid : Message : 91.413021 s : Average mflops/s per call : 1.12574e+10 +Grid : Message : 91.413023 s : Average mflops/s per call per rank : 7.03586e+08 +Grid : Message : 91.413024 s : Average mflops/s per call per node : 2.81434e+09 +Grid : Message : 91.413025 s : Average mflops/s per call (full) : 4.01145e+07 +Grid : Message : 91.413026 s : Average mflops/s per call per rank (full): 2.50716e+06 +Grid : Message : 91.413027 s : Average mflops/s per call per node (full): 1.00286e+07 +Grid : Message : 91.413028 s : WilsonFermion5D Stencil +Grid : Message : 91.413029 s : WilsonFermion5D StencilEven +Grid : Message : 91.413030 s : WilsonFermion5D StencilOdd +Grid : Message : 91.413031 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 91.413032 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 91.413033 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 91.470394 s : r_e6.02111 +Grid : Message : 91.476539 s : r_o6.02102 +Grid : Message : 91.482442 s 
: res12.0421 +Grid : Message : 92.138799 s : norm diff 0 +Grid : Message : 92.545354 s : norm diff even 0 +Grid : Message : 92.619444 s : norm diff odd 0 diff --git a/systems/Booster/dwf16.slurm b/systems/Booster/dwf16.slurm new file mode 100644 index 00000000..e7729447 --- /dev/null +++ b/systems/Booster/dwf16.slurm @@ -0,0 +1,29 @@ +#!/bin/sh +#SBATCH --account=gm2dwf +#SBATCH --nodes=16 +#SBATCH --ntasks=64 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=12 +#SBATCH --time=0:30:00 +#SBATCH --partition=booster +#SBATCH --gres=gpu:4 + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=yes +export UCX_MEMTYPE_CACHE=n +OPT="--comms-overlap --comms-concurrent" + + +srun -N 16 -n $SLURM_NTASKS \ + ./benchmarks/Benchmark_dwf_fp32 \ + $OPT \ + --mpi 2.2.2.8 \ + --accelerator-threads 8 \ + --grid 64.64.64.256 \ + --shm 2048 > dwf.16node.perf + + diff --git a/systems/Booster/dwf4.slurm b/systems/Booster/dwf4.slurm new file mode 100644 index 00000000..e8f4e738 --- /dev/null +++ b/systems/Booster/dwf4.slurm @@ -0,0 +1,39 @@ +#!/bin/sh +#SBATCH --account=gm2dwf +#SBATCH --nodes=4 +#SBATCH --ntasks=16 +#SBATCH --ntasks-per-node=4 +#SBATCH --cpus-per-task=12 +#SBATCH --time=2:00:00 +#SBATCH --partition=develbooster +#SBATCH --gres=gpu:4 + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=yes +export UCX_MEMTYPE_CACHE=n + +OPT="--comms-overlap --comms-concurrent" + +srun -N 4 -n $SLURM_NTASKS \ + ./benchmarks/Benchmark_dwf_fp32 \ + $OPT \ + --mpi 2.2.2.2 \ + --accelerator-threads 8 \ + --grid 64.64.64.64 \ + --shm 2048 > dwf.4node.perf + + +srun -N 4 -n $SLURM_NTASKS \ + ./benchmarks/Benchmark_comms_host_device \ + --mpi 2.2.2.2 \ + --accelerator-threads 8 \ + --grid 64.64.64.64 \ + --shm 2048 > comms.4node.perf + + + + diff --git a/systems/Booster/sourceme.sh b/systems/Booster/sourceme.sh new file mode 100644 index 00000000..56499be4 --- /dev/null +++ b/systems/Booster/sourceme.sh @@ -0,0 +1,5 @@ +module load GCC/9.3.0 +module load GMP/6.2.0 +module load MPFR/4.1.0 +module load OpenMPI/4.1.0rc1 +module load CUDA/11.3 diff --git a/systems/Perlmutter/comms.4node b/systems/Perlmutter/comms.4node new file mode 100644 index 00000000..25da2164 --- /dev/null +++ b/systems/Perlmutter/comms.4node @@ -0,0 +1,129 @@ +SLURM detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42506321920 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 2 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-setdevice=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 16 +SharedMemoryMpi: Node communicator of size 4 
+0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 1073741824bytes at 0x7f8d40000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 1073741824 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34005057536 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 0.956704 s : Grid is setup to use 32 threads +Grid : Message : 0.956709 s : Number of iterations to average: 250 +Grid : Message : 0.956712 s : ==================================================================================================== +Grid : Message : 0.956713 s : = Benchmarking sequential halo exchange from host memory +Grid : Message : 0.956714 s : ==================================================================================================== +Grid : Message : 0.956715 s : L Ls bytes MB/s uni MB/s bidi +Grid : Message : 1.108420 s : 8 8 393216 15427.2 30854.4 +Grid : Message : 1.198740 s : 8 8 393216 87332.8 174665.6 +Grid : Message : 1.574400 s : 8 8 393216 20938.0 41876.0 +Grid : Message : 1.956280 s : 8 8 393216 20598.0 41196.0 +Grid : Message : 1.125254 s : 12 8 1327104 105614.9 211229.8 +Grid : Message : 1.149709 s : 12 8 1327104 108578.8 217157.5 +Grid : Message : 1.262612 s : 12 8 1327104 23510.2 47020.4 +Grid : Message : 1.377804 s : 12 8 1327104 23043.0 46086.0 +Grid : Message : 1.445986 s : 16 8 3145728 107931.9 215863.7 +Grid : Message : 1.501495 s : 16 8 3145728 113380.0 226760.0 +Grid : Message : 1.766377 s : 16 8 3145728 23752.8 47505.6 +Grid : Message : 2.301720 s : 16 8 3145728 23850.6 47701.2 +Grid : Message : 2.158035 s : 20 8 6144000 109657.5 219315.0 +Grid : Message : 2.268232 s : 20 8 6144000 111535.7 223071.4 +Grid : Message : 2.779996 s : 20 8 6144000 24011.8 48023.6 +Grid : Message : 3.289081 s : 20 8 6144000 24137.8 48275.7 +Grid : Message : 3.549101 s : 24 8 10616832 89696.1 179392.2 +Grid : Message : 3.779416 s : 24 8 10616832 92205.2 184410.4 +Grid : Message : 4.656539 s : 24 8 
10616832 24209.0 48417.9 +Grid : Message : 5.531893 s : 24 8 10616832 24257.5 48515.0 +Grid : Message : 6.800400 s : 28 8 16859136 76106.8 152213.6 +Grid : Message : 6.443946 s : 28 8 16859136 77350.6 154701.1 +Grid : Message : 7.830994 s : 28 8 16859136 24309.8 48619.6 +Grid : Message : 9.215301 s : 28 8 16859136 24357.8 48715.5 +Grid : Message : 9.955615 s : 32 8 25165824 72403.7 144807.4 +Grid : Message : 10.648284 s : 32 8 25165824 72666.2 145332.4 +Grid : Message : 12.713098 s : 32 8 25165824 24376.2 48752.3 +Grid : Message : 14.775577 s : 32 8 25165824 24403.6 48807.3 +Grid : Message : 14.777794 s : ==================================================================================================== +Grid : Message : 14.777799 s : = Benchmarking sequential halo exchange from GPU memory +Grid : Message : 14.777800 s : ==================================================================================================== +Grid : Message : 14.777801 s : L Ls bytes MB/s uni MB/s bidi +Grid : Message : 14.798392 s : 8 8 393216 49210.4 98420.9 +Grid : Message : 14.812519 s : 8 8 393216 55716.0 111432.1 +Grid : Message : 14.861908 s : 8 8 393216 15926.4 31852.9 +Grid : Message : 14.909307 s : 8 8 393216 16594.5 33189.1 +Grid : Message : 14.938366 s : 12 8 1327104 157435.7 314871.3 +Grid : Message : 14.954490 s : 12 8 1327104 164724.6 329449.3 +Grid : Message : 15.921650 s : 12 8 1327104 19280.2 38560.4 +Grid : Message : 15.229618 s : 12 8 1327104 19311.3 38622.7 +Grid : Message : 15.275707 s : 16 8 3145728 221257.5 442514.9 +Grid : Message : 15.303489 s : 16 8 3145728 226547.7 453095.4 +Grid : Message : 15.619610 s : 16 8 3145728 19902.6 39805.2 +Grid : Message : 15.935287 s : 16 8 3145728 19930.6 39861.2 +Grid : Message : 15.999038 s : 20 8 6144000 269586.0 539172.0 +Grid : Message : 16.435890 s : 20 8 6144000 275886.8 551773.7 +Grid : Message : 16.652349 s : 20 8 6144000 20185.6 40371.2 +Grid : Message : 17.262005 s : 20 8 6144000 20156.0 40311.9 +Grid : Message : 17.351417 s : 24 8 10616832 300428.2 600856.4 +Grid : Message : 17.421125 s : 24 8 10616832 304656.8 609313.6 +Grid : Message : 18.477072 s : 24 8 10616832 20108.9 40217.7 +Grid : Message : 19.556481 s : 24 8 10616832 19671.8 39343.6 +Grid : Message : 19.681365 s : 28 8 16859136 318966.5 637933.1 +Grid : Message : 19.786400 s : 28 8 16859136 321056.1 642112.1 +Grid : Message : 21.531557 s : 28 8 16859136 19321.2 38642.4 +Grid : Message : 23.384312 s : 28 8 16859136 18199.2 36398.3 +Grid : Message : 23.556358 s : 32 8 25165824 332397.6 664795.2 +Grid : Message : 23.706392 s : 32 8 25165824 335492.9 670985.8 +Grid : Message : 26.356425 s : 32 8 25165824 18992.9 37985.9 +Grid : Message : 29.126692 s : 32 8 25165824 18168.6 36337.3 +Grid : Message : 29.137480 s : ==================================================================================================== +Grid : Message : 29.137485 s : = All done; Bye Bye +Grid : Message : 29.137486 s : ==================================================================================================== diff --git a/systems/Perlmutter/config-command b/systems/Perlmutter/config-command new file mode 100644 index 00000000..b399c535 --- /dev/null +++ b/systems/Perlmutter/config-command @@ -0,0 +1,12 @@ +../../configure \ + --enable-comms=mpi \ + --enable-simd=GPU \ + --enable-shm=nvlink \ + --enable-gen-simd-width=64 \ + --enable-accelerator=cuda \ + --disable-fermion-reps \ + --disable-unified \ + --disable-gparity \ + CXX=nvcc \ + LDFLAGS="-cudart shared " \ + CXXFLAGS="-ccbin CC -gencode 
arch=compute_80,code=sm_80 -std=c++14 -cudart shared" diff --git a/systems/Perlmutter/dwf.48.48.48.48.4node.opt0 b/systems/Perlmutter/dwf.48.48.48.48.4node.opt0 new file mode 100644 index 00000000..30e3194a --- /dev/null +++ b/systems/Perlmutter/dwf.48.48.48.48.4node.opt0 @@ -0,0 +1,156 @@ +SLURM detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42506321920 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 2 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-setdevice=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 16 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fc320000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
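The mflop/s figures in the Dhop reports below follow from total flops over wall time. The totals are consistent with the conventional 1320 flops per 4-d site per Wilson dslash application, times the fifth dimension; Ls is not printed in these logs, so Ls = 16 here is inferred from the numbers and should be treated as an assumption. A sketch reproducing the headline of this 48^4 opt0 run, up to rounding of the printed time:

# reproduce "mflop/s = 9.81714e+06" from "Called Dw 3000 times in 3.42606e+07 us"
awk 'BEGIN {
  vol   = 48*48*48*48                 # global 4-d volume
  Ls    = 16                          # assumed, not printed in the log
  flops = 1320.0 * vol * Ls * 3000    # 1320 flops/site is the usual Wilson count
  usecs = 3.42606e7                   # total wall time from the report below
  printf "mflop/s = %.5e\n", flops / usecs   # flops per microsecond is Mflop/s
}'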
+Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34005057536 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 0.762377 s : Grid Layout +Grid : Message : 0.762378 s : Global lattice size : 48 48 48 48 +Grid : Message : 0.762381 s : OpenMP threads : 32 +Grid : Message : 0.762382 s : MPI tasks : 2 2 2 2 +Grid : Message : 0.790912 s : Making s innermost grids +Grid : Message : 0.817408 s : Initialising 4d RNG +Grid : Message : 0.840908 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 0.840921 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 0.911684 s : Initialising 5d RNG +Grid : Message : 1.270530 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 1.270544 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 1.568435 s : Initialised RNGs +Grid : Message : 2.241446 s : Drawing gauge field +Grid : Message : 2.318921 s : Random gauge initialised +Grid : Message : 2.779258 s : Setting up Cshift based reference +Grid : Message : 3.188306 s : ***************************************************************** +Grid : Message : 3.188315 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 3.188316 s : ***************************************************************** +Grid : Message : 3.188316 s : ***************************************************************** +Grid : Message : 3.188316 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 3.188316 s : * Vectorising space-time by 8 +Grid : Message : 3.188317 s : * VComplexF size is 64 B +Grid : Message : 3.188318 s : * SINGLE precision +Grid : Message : 3.188318 s : * Using Overlapped Comms/Compute +Grid : Message : 3.188318 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 3.188318 s : ***************************************************************** +Grid : Message : 3.548355 s : Called warmup +Grid : Message : 37.809000 s : Called Dw 3000 times in 3.42606e+07 us +Grid : Message : 37.809040 s : mflop/s = 9.81714e+06 +Grid : Message : 37.809042 s : mflop/s per rank = 613572 +Grid : Message : 37.809043 s : mflop/s per node = 2.45429e+06 +Grid : Message : 37.809044 s : RF GiB/s (base 2) = 19948.2 +Grid : Message : 37.809045 s : mem GiB/s (base 2) = 12467.6 +Grid : Message : 37.810181 s : norm diff 1.03662e-13 +Grid : Message : 37.824163 s : #### Dhop calls report +Grid : Message : 37.824168 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 37.824172 s : WilsonFermion5D TotalTime /Calls : 5719.36 us +Grid : Message : 37.824173 s : WilsonFermion5D CommTime /Calls : 5085.34 us +Grid : Message : 37.824174 s : WilsonFermion5D FaceTime /Calls : 265.445 us +Grid : Message : 37.824175 s : WilsonFermion5D ComputeTime1/Calls : 23.4602 us +Grid : Message : 37.824176 s : WilsonFermion5D ComputeTime2/Calls : 370.89 us +Grid : 
Message : 37.824191 s : Average mflops/s per call : 2.36923e+09 +Grid : Message : 37.824194 s : Average mflops/s per call per rank : 1.48077e+08 +Grid : Message : 37.824195 s : Average mflops/s per call per node : 5.92307e+08 +Grid : Message : 37.824196 s : Average mflops/s per call (full) : 9.97945e+06 +Grid : Message : 37.824197 s : Average mflops/s per call per rank (full): 623716 +Grid : Message : 37.824198 s : Average mflops/s per call per node (full): 2.49486e+06 +Grid : Message : 37.824199 s : WilsonFermion5D Stencil +Grid : Message : 37.824199 s : WilsonFermion5D StencilEven +Grid : Message : 37.824199 s : WilsonFermion5D StencilOdd +Grid : Message : 37.824199 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 37.824199 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 37.824199 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 41.538537 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 41.538549 s : Called DwDag +Grid : Message : 41.538550 s : norm dag result 12.0422 +Grid : Message : 41.543416 s : norm dag ref 12.0422 +Grid : Message : 41.548999 s : norm dag diff 7.6086e-14 +Grid : Message : 41.563564 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 41.711516 s : src_e0.499992 +Grid : Message : 41.735103 s : src_o0.500008 +Grid : Message : 41.756142 s : ********************************************************* +Grid : Message : 41.756144 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 41.756145 s : * Vectorising space-time by 8 +Grid : Message : 41.756146 s : * SINGLE precision +Grid : Message : 41.756147 s : * Using Overlapped Comms/Compute +Grid : Message : 41.756148 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 41.756148 s : ********************************************************* +Grid : Message : 59.255023 s : Deo mflop/s = 9.6274e+06 +Grid : Message : 59.255044 s : Deo mflop/s per rank 601712 +Grid : Message : 59.255046 s : Deo mflop/s per node 2.40685e+06 +Grid : Message : 59.255048 s : #### Dhop calls report +Grid : Message : 59.255049 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 59.255050 s : WilsonFermion5D TotalTime /Calls : 5830.89 us +Grid : Message : 59.255051 s : WilsonFermion5D CommTime /Calls : 5143.28 us +Grid : Message : 59.255052 s : WilsonFermion5D FaceTime /Calls : 316.834 us +Grid : Message : 59.255053 s : WilsonFermion5D ComputeTime1/Calls : 37.4065 us +Grid : Message : 59.255054 s : WilsonFermion5D ComputeTime2/Calls : 375.889 us +Grid : Message : 59.255076 s : Average mflops/s per call : 1.4225e+09 +Grid : Message : 59.255077 s : Average mflops/s per call per rank : 8.8906e+07 +Grid : Message : 59.255078 s : Average mflops/s per call per node : 3.55624e+08 +Grid : Message : 59.255079 s : Average mflops/s per call (full) : 9.78858e+06 +Grid : Message : 59.255080 s : Average mflops/s per call per rank (full): 611786 +Grid : Message : 59.255081 s : Average mflops/s per call per node (full): 2.44714e+06 +Grid : Message : 59.255082 s : WilsonFermion5D Stencil +Grid : Message : 59.255082 s : WilsonFermion5D StencilEven +Grid : Message : 59.255082 s : WilsonFermion5D StencilOdd +Grid : Message : 59.255082 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 59.255082 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 59.255082 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 59.286796 s : r_e6.02129 +Grid : Message : 59.290118 s : r_o6.02097 +Grid : Message : 59.292558 s : res12.0423 +Grid : Message : 
59.482803 s : norm diff 0 +Grid : Message : 59.604297 s : norm diff even 0 +Grid : Message : 59.626743 s : norm diff odd 0 diff --git a/systems/Perlmutter/dwf.48.48.48.48.4node.opt1 b/systems/Perlmutter/dwf.48.48.48.48.4node.opt1 new file mode 100644 index 00000000..f54e9d14 --- /dev/null +++ b/systems/Perlmutter/dwf.48.48.48.48.4node.opt1 @@ -0,0 +1,156 @@ +SLURM detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42506321920 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 2 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-setdevice=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 16 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fbae0000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
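The opt0 and opt1 logs in this directory differ only in the --shm-mpi 0 versus --shm-mpi 1 option (see systems/Perlmutter/dwf4.slurm later in this diff). On the 48^4 volume the headline Dhop rate rises from 9.81714e+06 to 1.035e+07 mflop/s, roughly a 5% gain from passing intra-node halos through shared memory rather than MPI. To pull the headline Dw and Deo lines out side by side:

grep -H "mflop/s = " dwf.48.48.48.48.4node.opt0 dwf.48.48.48.48.4node.opt1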
+Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34005057536 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 0.692368 s : Grid Layout +Grid : Message : 0.692369 s : Global lattice size : 48 48 48 48 +Grid : Message : 0.692372 s : OpenMP threads : 32 +Grid : Message : 0.692372 s : MPI tasks : 2 2 2 2 +Grid : Message : 0.701977 s : Making s innermost grids +Grid : Message : 0.711295 s : Initialising 4d RNG +Grid : Message : 0.734938 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 0.734948 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 0.798281 s : Initialising 5d RNG +Grid : Message : 1.161711 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 1.161728 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 1.522440 s : Initialised RNGs +Grid : Message : 2.260710 s : Drawing gauge field +Grid : Message : 2.102597 s : Random gauge initialised +Grid : Message : 2.562592 s : Setting up Cshift based reference +Grid : Message : 3.121880 s : ***************************************************************** +Grid : Message : 3.121970 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 3.121980 s : ***************************************************************** +Grid : Message : 3.121980 s : ***************************************************************** +Grid : Message : 3.121980 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 3.121980 s : * Vectorising space-time by 8 +Grid : Message : 3.121980 s : * VComplexF size is 64 B +Grid : Message : 3.121990 s : * SINGLE precision +Grid : Message : 3.121990 s : * Using Overlapped Comms/Compute +Grid : Message : 3.121990 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 3.121990 s : ***************************************************************** +Grid : Message : 3.350688 s : Called warmup +Grid : Message : 35.847527 s : Called Dw 3000 times in 3.24968e+07 us +Grid : Message : 35.847576 s : mflop/s = 1.035e+07 +Grid : Message : 35.847578 s : mflop/s per rank = 646874 +Grid : Message : 35.847579 s : mflop/s per node = 2.5875e+06 +Grid : Message : 35.847580 s : RF GiB/s (base 2) = 21030.9 +Grid : Message : 35.847581 s : mem GiB/s (base 2) = 13144.3 +Grid : Message : 35.848697 s : norm diff 1.03662e-13 +Grid : Message : 35.861967 s : #### Dhop calls report +Grid : Message : 35.861973 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 35.861976 s : WilsonFermion5D TotalTime /Calls : 5426 us +Grid : Message : 35.861977 s : WilsonFermion5D CommTime /Calls : 4817.47 us +Grid : Message : 35.861978 s : WilsonFermion5D FaceTime /Calls : 246.175 us +Grid : Message : 35.861979 s : WilsonFermion5D ComputeTime1/Calls : 8.72676 us +Grid : Message : 35.861980 s : WilsonFermion5D ComputeTime2/Calls : 370.494 us +Grid : 
Message : 35.861995 s : Average mflops/s per call : 6.50606e+09 +Grid : Message : 35.861999 s : Average mflops/s per call per rank : 4.06629e+08 +Grid : Message : 35.862000 s : Average mflops/s per call per node : 1.62652e+09 +Grid : Message : 35.862001 s : Average mflops/s per call (full) : 1.0519e+07 +Grid : Message : 35.862002 s : Average mflops/s per call per rank (full): 657438 +Grid : Message : 35.862003 s : Average mflops/s per call per node (full): 2.62975e+06 +Grid : Message : 35.862004 s : WilsonFermion5D Stencil +Grid : Message : 35.862004 s : WilsonFermion5D StencilEven +Grid : Message : 35.862004 s : WilsonFermion5D StencilOdd +Grid : Message : 35.862004 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 35.862004 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 35.862004 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 39.599406 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 39.599421 s : Called DwDag +Grid : Message : 39.599422 s : norm dag result 12.0422 +Grid : Message : 39.604317 s : norm dag ref 12.0422 +Grid : Message : 39.609961 s : norm dag diff 7.6086e-14 +Grid : Message : 39.624145 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 39.772334 s : src_e0.499992 +Grid : Message : 39.795705 s : src_o0.500008 +Grid : Message : 39.816822 s : ********************************************************* +Grid : Message : 39.816824 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 39.816825 s : * Vectorising space-time by 8 +Grid : Message : 39.816826 s : * SINGLE precision +Grid : Message : 39.816827 s : * Using Overlapped Comms/Compute +Grid : Message : 39.816828 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 39.816828 s : ********************************************************* +Grid : Message : 56.382758 s : Deo mflop/s = 1.017e+07 +Grid : Message : 56.382779 s : Deo mflop/s per rank 635627 +Grid : Message : 56.382781 s : Deo mflop/s per node 2.54251e+06 +Grid : Message : 56.382783 s : #### Dhop calls report +Grid : Message : 56.382784 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 56.382785 s : WilsonFermion5D TotalTime /Calls : 5519.98 us +Grid : Message : 56.382786 s : WilsonFermion5D CommTime /Calls : 4856.39 us +Grid : Message : 56.382787 s : WilsonFermion5D FaceTime /Calls : 303.043 us +Grid : Message : 56.382788 s : WilsonFermion5D ComputeTime1/Calls : 6.77807 us +Grid : Message : 56.382789 s : WilsonFermion5D ComputeTime2/Calls : 376.551 us +Grid : Message : 56.382810 s : Average mflops/s per call : 8.31124e+09 +Grid : Message : 56.382811 s : Average mflops/s per call per rank : 5.19453e+08 +Grid : Message : 56.382812 s : Average mflops/s per call per node : 2.07781e+09 +Grid : Message : 56.382813 s : Average mflops/s per call (full) : 1.03399e+07 +Grid : Message : 56.382814 s : Average mflops/s per call per rank (full): 646244 +Grid : Message : 56.382815 s : Average mflops/s per call per node (full): 2.58498e+06 +Grid : Message : 56.382816 s : WilsonFermion5D Stencil +Grid : Message : 56.382816 s : WilsonFermion5D StencilEven +Grid : Message : 56.382816 s : WilsonFermion5D StencilOdd +Grid : Message : 56.382816 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 56.382816 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 56.382816 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 56.414571 s : r_e6.02129 +Grid : Message : 56.417837 s : r_o6.02097 +Grid : Message : 56.420535 s : res12.0423 +Grid : Message : 
56.611957 s : norm diff 0 +Grid : Message : 56.730597 s : norm diff even 0 +Grid : Message : 56.752566 s : norm diff odd 0 diff --git a/systems/Perlmutter/dwf.64.64.64.64.4node.opt0 b/systems/Perlmutter/dwf.64.64.64.64.4node.opt0 new file mode 100644 index 00000000..b16a3219 --- /dev/null +++ b/systems/Perlmutter/dwf.64.64.64.64.4node.opt0 @@ -0,0 +1,156 @@ +SLURM detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42506321920 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 2 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-setdevice=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 16 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fd460000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
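Like the 48^4 runs, the 64^4 reports below are strongly communication-bound: CommTime per call is close to 90% of TotalTime per call, so each Dhop spends most of its time waiting on halo exchange. Using the opt0 figures from the report below:

awk 'BEGIN { printf "comm fraction = %.1f%%\n", 100 * 12137.3 / 13588.2 }'   # ~89%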
+Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34005057536 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 0.667601 s : Grid Layout +Grid : Message : 0.667602 s : Global lattice size : 64 64 64 64 +Grid : Message : 0.667610 s : OpenMP threads : 32 +Grid : Message : 0.667611 s : MPI tasks : 2 2 2 2 +Grid : Message : 0.702872 s : Making s innermost grids +Grid : Message : 0.742911 s : Initialising 4d RNG +Grid : Message : 0.813463 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 0.813479 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 0.922630 s : Initialising 5d RNG +Grid : Message : 2.306290 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 2.306540 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 3.878430 s : Initialised RNGs +Grid : Message : 4.536926 s : Drawing gauge field +Grid : Message : 4.824391 s : Random gauge initialised +Grid : Message : 6.253195 s : Setting up Cshift based reference +Grid : Message : 7.326402 s : ***************************************************************** +Grid : Message : 7.326411 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 7.326412 s : ***************************************************************** +Grid : Message : 7.326412 s : ***************************************************************** +Grid : Message : 7.326412 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 7.326412 s : * Vectorising space-time by 8 +Grid : Message : 7.326413 s : * VComplexF size is 64 B +Grid : Message : 7.326414 s : * SINGLE precision +Grid : Message : 7.326414 s : * Using Overlapped Comms/Compute +Grid : Message : 7.326414 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 7.326414 s : ***************************************************************** +Grid : Message : 8.283417 s : Called warmup +Grid : Message : 89.658859 s : Called Dw 3000 times in 8.13753e+07 us +Grid : Message : 89.658898 s : mflop/s = 1.3063e+07 +Grid : Message : 89.658900 s : mflop/s per rank = 816437 +Grid : Message : 89.658901 s : mflop/s per node = 3.26575e+06 +Grid : Message : 89.658902 s : RF GiB/s (base 2) = 26543.7 +Grid : Message : 89.658903 s : mem GiB/s (base 2) = 16589.8 +Grid : Message : 89.662424 s : norm diff 1.03481e-13 +Grid : Message : 89.700433 s : #### Dhop calls report +Grid : Message : 89.700452 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 89.700456 s : WilsonFermion5D TotalTime /Calls : 13588.2 us +Grid : Message : 89.700457 s : WilsonFermion5D CommTime /Calls : 12137.3 us +Grid : Message : 89.700458 s : WilsonFermion5D FaceTime /Calls : 548.408 us +Grid : Message : 89.700459 s : WilsonFermion5D ComputeTime1/Calls : 42.6163 us +Grid : Message : 89.700460 s : WilsonFermion5D ComputeTime2/Calls : 910.312 us +Grid : 
Message : 89.700477 s : Average mflops/s per call : 4.43502e+09 +Grid : Message : 89.700493 s : Average mflops/s per call per rank : 2.77189e+08 +Grid : Message : 89.700494 s : Average mflops/s per call per node : 1.10875e+09 +Grid : Message : 89.700495 s : Average mflops/s per call (full) : 1.32753e+07 +Grid : Message : 89.700496 s : Average mflops/s per call per rank (full): 829709 +Grid : Message : 89.700497 s : Average mflops/s per call per node (full): 3.31884e+06 +Grid : Message : 89.700498 s : WilsonFermion5D Stencil +Grid : Message : 89.700498 s : WilsonFermion5D StencilEven +Grid : Message : 89.700498 s : WilsonFermion5D StencilOdd +Grid : Message : 89.700499 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 89.700499 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 89.700499 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 101.462401 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 101.462412 s : Called DwDag +Grid : Message : 101.462413 s : norm dag result 12.0421 +Grid : Message : 101.474097 s : norm dag ref 12.0421 +Grid : Message : 101.489396 s : norm dag diff 7.63236e-14 +Grid : Message : 101.529094 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 101.996820 s : src_e0.499997 +Grid : Message : 102.626690 s : src_o0.500003 +Grid : Message : 102.125734 s : ********************************************************* +Grid : Message : 102.125736 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 102.125737 s : * Vectorising space-time by 8 +Grid : Message : 102.125738 s : * SINGLE precision +Grid : Message : 102.125739 s : * Using Overlapped Comms/Compute +Grid : Message : 102.125739 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 102.125739 s : ********************************************************* +Grid : Message : 143.296910 s : Deo mflop/s = 1.30119e+07 +Grid : Message : 143.297140 s : Deo mflop/s per rank 813244 +Grid : Message : 143.297160 s : Deo mflop/s per node 3.25297e+06 +Grid : Message : 143.297180 s : #### Dhop calls report +Grid : Message : 143.297190 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 143.297200 s : WilsonFermion5D TotalTime /Calls : 13630 us +Grid : Message : 143.297210 s : WilsonFermion5D CommTime /Calls : 12124.9 us +Grid : Message : 143.297220 s : WilsonFermion5D FaceTime /Calls : 590.958 us +Grid : Message : 143.297230 s : WilsonFermion5D ComputeTime1/Calls : 43.2806 us +Grid : Message : 143.297240 s : WilsonFermion5D ComputeTime2/Calls : 921.187 us +Grid : Message : 143.297460 s : Average mflops/s per call : 4.24329e+09 +Grid : Message : 143.297470 s : Average mflops/s per call per rank : 2.65206e+08 +Grid : Message : 143.297480 s : Average mflops/s per call per node : 1.06082e+09 +Grid : Message : 143.297490 s : Average mflops/s per call (full) : 1.32347e+07 +Grid : Message : 143.297500 s : Average mflops/s per call per rank (full): 827169 +Grid : Message : 143.297510 s : Average mflops/s per call per node (full): 3.30868e+06 +Grid : Message : 143.297520 s : WilsonFermion5D Stencil +Grid : Message : 143.297520 s : WilsonFermion5D StencilEven +Grid : Message : 143.297520 s : WilsonFermion5D StencilOdd +Grid : Message : 143.297520 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 143.297520 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 143.297520 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 143.112368 s : r_e6.02111 +Grid : Message : 143.119760 s : r_o6.02102 +Grid : Message : 
143.126239 s : res12.0421 +Grid : Message : 143.720780 s : norm diff 0 +Grid : Message : 144.885380 s : norm diff even 0 +Grid : Message : 144.154396 s : norm diff odd 0 diff --git a/systems/Perlmutter/dwf.64.64.64.64.4node.opt1 b/systems/Perlmutter/dwf.64.64.64.64.4node.opt1 new file mode 100644 index 00000000..d48f8126 --- /dev/null +++ b/systems/Perlmutter/dwf.64.64.64.64.4node.opt1 @@ -0,0 +1,156 @@ +SLURM detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42506321920 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 2 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-setdevice=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 16 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7f4b80000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
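Comparing volumes at fixed node count: the 64^4 opt1 run below reaches 1.41932e+07 mflop/s against 1.035e+07 for 48^4 opt1, a factor of about 1.37. A gain with larger local volume is expected, since the surface-to-volume ratio, and with it the halo traffic per site, falls as the local lattice grows:

awk 'BEGIN { printf "64^4 / 48^4 opt1 Dhop ratio = %.2f\n", 1.41932e7 / 1.035e7 }'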
+Current Grid git commit hash=b2ccaad761798e93a9314f97d8a4d1f851c6962a: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34005057536 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 0.648397 s : Grid Layout +Grid : Message : 0.648398 s : Global lattice size : 64 64 64 64 +Grid : Message : 0.648401 s : OpenMP threads : 32 +Grid : Message : 0.648402 s : MPI tasks : 2 2 2 2 +Grid : Message : 0.663662 s : Making s innermost grids +Grid : Message : 0.682145 s : Initialising 4d RNG +Grid : Message : 0.754321 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 0.754332 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 0.863265 s : Initialising 5d RNG +Grid : Message : 1.967677 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 1.967691 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 2.921676 s : Initialised RNGs +Grid : Message : 4.382384 s : Drawing gauge field +Grid : Message : 4.672590 s : Random gauge initialised +Grid : Message : 6.102697 s : Setting up Cshift based reference +Grid : Message : 7.185897 s : ***************************************************************** +Grid : Message : 7.185906 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 7.185907 s : ***************************************************************** +Grid : Message : 7.185907 s : ***************************************************************** +Grid : Message : 7.185907 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 7.185907 s : * Vectorising space-time by 8 +Grid : Message : 7.185908 s : * VComplexF size is 64 B +Grid : Message : 7.185909 s : * SINGLE precision +Grid : Message : 7.185909 s : * Using Overlapped Comms/Compute +Grid : Message : 7.185909 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 7.185909 s : ***************************************************************** +Grid : Message : 8.114241 s : Called warmup +Grid : Message : 83.988100 s : Called Dw 3000 times in 7.48954e+07 us +Grid : Message : 83.992400 s : mflop/s = 1.41932e+07 +Grid : Message : 83.992600 s : mflop/s per rank = 887074 +Grid : Message : 83.992700 s : mflop/s per node = 3.5483e+06 +Grid : Message : 83.992800 s : RF GiB/s (base 2) = 28840.2 +Grid : Message : 83.992900 s : mem GiB/s (base 2) = 18025.1 +Grid : Message : 83.134870 s : norm diff 1.03481e-13 +Grid : Message : 83.493960 s : #### Dhop calls report +Grid : Message : 83.494000 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 83.494030 s : WilsonFermion5D TotalTime /Calls : 12506 us +Grid : Message : 83.494040 s : WilsonFermion5D CommTime /Calls : 11071.5 us +Grid : Message : 83.494050 s : WilsonFermion5D FaceTime /Calls : 530.971 us +Grid : Message : 83.494060 s : WilsonFermion5D ComputeTime1/Calls : 23.6428 us +Grid : Message : 83.494070 s : WilsonFermion5D ComputeTime2/Calls : 911.864 us +Grid : 
Message : 83.494220 s : Average mflops/s per call : 7.6108e+09 +Grid : Message : 83.494250 s : Average mflops/s per call per rank : 4.75675e+08 +Grid : Message : 83.494260 s : Average mflops/s per call per node : 1.9027e+09 +Grid : Message : 83.494270 s : Average mflops/s per call (full) : 1.44242e+07 +Grid : Message : 83.494280 s : Average mflops/s per call per rank (full): 901513 +Grid : Message : 83.494290 s : Average mflops/s per call per node (full): 3.60605e+06 +Grid : Message : 83.494300 s : WilsonFermion5D Stencil +Grid : Message : 83.494300 s : WilsonFermion5D StencilEven +Grid : Message : 83.494300 s : WilsonFermion5D StencilOdd +Grid : Message : 83.494300 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 83.494300 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 83.494300 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 94.600488 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 94.600501 s : Called DwDag +Grid : Message : 94.600502 s : norm dag result 12.0421 +Grid : Message : 94.613445 s : norm dag ref 12.0421 +Grid : Message : 94.628514 s : norm dag diff 7.63236e-14 +Grid : Message : 94.666370 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 95.136361 s : src_e0.499997 +Grid : Message : 95.208108 s : src_o0.500003 +Grid : Message : 95.271511 s : ********************************************************* +Grid : Message : 95.271512 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 95.271513 s : * Vectorising space-time by 8 +Grid : Message : 95.271514 s : * SINGLE precision +Grid : Message : 95.271514 s : * Using Overlapped Comms/Compute +Grid : Message : 95.271515 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 95.271515 s : ********************************************************* +Grid : Message : 132.766274 s : Deo mflop/s = 1.41952e+07 +Grid : Message : 132.766295 s : Deo mflop/s per rank 887201 +Grid : Message : 132.766297 s : Deo mflop/s per node 3.5488e+06 +Grid : Message : 132.766299 s : #### Dhop calls report +Grid : Message : 132.766300 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 132.766301 s : WilsonFermion5D TotalTime /Calls : 12493.9 us +Grid : Message : 132.766302 s : WilsonFermion5D CommTime /Calls : 10990.2 us +Grid : Message : 132.766303 s : WilsonFermion5D FaceTime /Calls : 604.889 us +Grid : Message : 132.766304 s : WilsonFermion5D ComputeTime1/Calls : 13.7158 us +Grid : Message : 132.766305 s : WilsonFermion5D ComputeTime2/Calls : 920.096 us +Grid : Message : 132.766326 s : Average mflops/s per call : 1.31121e+10 +Grid : Message : 132.766328 s : Average mflops/s per call per rank : 8.19504e+08 +Grid : Message : 132.766329 s : Average mflops/s per call per node : 3.27802e+09 +Grid : Message : 132.766330 s : Average mflops/s per call (full) : 1.44381e+07 +Grid : Message : 132.766331 s : Average mflops/s per call per rank (full): 902382 +Grid : Message : 132.766332 s : Average mflops/s per call per node (full): 3.60953e+06 +Grid : Message : 132.766333 s : WilsonFermion5D Stencil +Grid : Message : 132.766333 s : WilsonFermion5D StencilEven +Grid : Message : 132.766333 s : WilsonFermion5D StencilOdd +Grid : Message : 132.766333 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 132.766333 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 132.766333 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 132.847999 s : r_e6.02111 +Grid : Message : 132.854237 s : r_o6.02102 +Grid : Message : 132.860309 s : 
res12.0421 +Grid : Message : 133.458462 s : norm diff 0 +Grid : Message : 133.832713 s : norm diff even 0 +Grid : Message : 133.909147 s : norm diff odd 0 diff --git a/systems/Perlmutter/dwf4.slurm b/systems/Perlmutter/dwf4.slurm new file mode 100644 index 00000000..ba198595 --- /dev/null +++ b/systems/Perlmutter/dwf4.slurm @@ -0,0 +1,24 @@ +#!/bin/bash +#SBATCH -A mp13 +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 0:20:00 +#SBATCH -n 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --exclusive +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=map_gpu:0,1,2,3 + +export SLURM_CPU_BIND="cores" +export MPICH_RDMA_ENABLED_CUDA=1 +export MPICH_GPU_SUPPORT_ENABLED=1 +srun ./benchmarks/Benchmark_comms_host_device --mpi 2.2.2.2 --accelerator-threads 8 > comms.4node + +OPT="--comms-overlap --comms-concurrent --shm-mpi 0" +srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt0 +srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt0 + +OPT="--comms-overlap --comms-concurrent --shm-mpi 1" +srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 64.64.64.64 --accelerator-threads 8 --shm 2048 $OPT > dwf.64.64.64.64.4node.opt1 +srun ./benchmarks/Benchmark_dwf_fp32 --mpi 2.2.2.2 --grid 48.48.48.48 --accelerator-threads 8 --shm 2048 $OPT > dwf.48.48.48.48.4node.opt1 diff --git a/systems/Perlmutter/sourceme.sh b/systems/Perlmutter/sourceme.sh new file mode 100644 index 00000000..9359dea9 --- /dev/null +++ b/systems/Perlmutter/sourceme.sh @@ -0,0 +1,4 @@ + +export CRAY_ACCEL_TARGET=nvidia80 + +module load PrgEnv-gnu cpe-cuda cuda diff --git a/systems/Tursa/config-command b/systems/Tursa/config-command new file mode 100644 index 00000000..b47c34e5 --- /dev/null +++ b/systems/Tursa/config-command @@ -0,0 +1,12 @@ +../../configure \ + --enable-comms=mpi \ + --enable-simd=GPU \ + --enable-shm=nvlink \ + --enable-gen-simd-width=64 \ + --enable-accelerator=cuda \ + --with-lime=/mnt/lustre/tursafs1/home/tc002/tc002/dc-boyl1/spack/spack/opt/spack/linux-rhel8-zen/gcc-8.4.1/c-lime-2-3-9-e6wxqrid6rqmd45z7n32dxkvkykpvyez \ + --disable-accelerator-cshift \ + --disable-unified \ + CXX=nvcc \ + LDFLAGS="-cudart shared " \ + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" diff --git a/systems/Tursa/dwf.16node.perf b/systems/Tursa/dwf.16node.perf new file mode 100644 index 00000000..a51aae94 --- /dev/null +++ b/systems/Tursa/dwf.16node.perf @@ -0,0 +1,293 @@ +tu-c0r1n00 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n00 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n00 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n00 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n21 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n06 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n12 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n21 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n21 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n21 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n06 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n06 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n12 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n06 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n12 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n12 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n12 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n12 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n12 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n12 - 3 device=3 
binding=--interleave=6,7 +tu-c0r1n18 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n18 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n18 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n18 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n06 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n09 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n06 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n15 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n09 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n06 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n15 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n15 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n06 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n15 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n09 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n09 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n09 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n09 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n09 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n09 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n21 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n21 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n21 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n15 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n21 - 3 device=3 binding=--interleave=6,7 +tu-c0r1n15 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n15 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n03 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n15 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n03 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n00 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n03 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n00 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n00 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n18 - 1 device=1 binding=--interleave=2,3 +tu-c0r2n00 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n18 - 2 device=2 binding=--interleave=4,5 +tu-c0r2n18 - 3 device=3 binding=--interleave=6,7 +tu-c0r2n18 - 0 device=0 binding=--interleave=0,1 +tu-c0r2n03 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n03 - 0 device=0 binding=--interleave=0,1 +tu-c0r1n03 - 1 device=1 binding=--interleave=2,3 +tu-c0r1n03 - 2 device=2 binding=--interleave=4,5 +tu-c0r1n03 - 3 device=3 binding=--interleave=6,7 +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize 
(2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 64 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7f05c0000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | 
| | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +Current Grid git commit hash=9d2238148c56e3fbadfa95dcabf2b83d4bde14cd: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34004218675 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 1.814936 s : Grid Layout +Grid : Message : 1.814947 s : Global lattice size : 64 64 64 256 +Grid : Message : 1.814952 s : OpenMP threads : 4 +Grid : Message : 1.814955 s : MPI tasks : 2 2 2 8 +Grid : Message : 1.859229 s : Making s innermost grids +Grid : Message : 1.907983 s : Initialising 4d RNG +Grid : Message : 1.999619 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 1.999657 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 3.786102 s : Initialising 5d RNG +Grid : Message : 5.361999 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 5.362036 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 38.698345 s : Initialised RNGs +Grid : Message : 42.821728 s : Drawing gauge field +Grid : Message : 43.916364 s : Random gauge initialised +Grid : Message : 46.410003 s : Setting up Cshift based reference +Grid : Message : 54.242661 s : ***************************************************************** +Grid : Message : 54.242686 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 54.242688 s : ***************************************************************** +Grid : Message : 54.242689 s : ***************************************************************** +Grid : Message : 54.242690 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 54.242691 s : * Vectorising space-time by 8 +Grid : Message : 54.242692 s : * VComplexF size is 64 B +Grid : Message : 54.242694 s : * SINGLE precision +Grid : Message : 54.242697 s : * Using Overlapped Comms/Compute +Grid : Message : 54.242698 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 54.242699 s : ***************************************************************** +Grid : Message : 56.314112 s : Called warmup +Grid : Message : 84.246354 s : Called Dw 3000 times in 2.79318e+07 us +Grid : Message : 84.246405 s : mflop/s = 1.52229e+08 +Grid : Message : 84.246408 s : mflop/s per rank = 2.37857e+06 +Grid : Message : 84.246412 s : mflop/s per node = 9.51428e+06 +Grid : Message : 84.246414 s : RF GiB/s (base 2) = 309325 +Grid : Message : 84.246417 s 
: mem GiB/s (base 2) = 193328 +Grid : Message : 84.250016 s : norm diff 1.03478e-13 +Grid : Message : 84.285132 s : #### Dhop calls report +Grid : Message : 84.285137 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 84.285140 s : WilsonFermion5D TotalTime /Calls : 4703.27 us +Grid : Message : 84.285142 s : WilsonFermion5D CommTime /Calls : 3131.05 us +Grid : Message : 84.285144 s : WilsonFermion5D FaceTime /Calls : 492.972 us +Grid : Message : 84.285146 s : WilsonFermion5D ComputeTime1/Calls : 56.9085 us +Grid : Message : 84.285148 s : WilsonFermion5D ComputeTime2/Calls : 1099.95 us +Grid : Message : 84.285160 s : Average mflops/s per call : 1.43412e+10 +Grid : Message : 84.285165 s : Average mflops/s per call per rank : 2.24082e+08 +Grid : Message : 84.285170 s : Average mflops/s per call per node : 8.96328e+08 +Grid : Message : 84.285173 s : Average mflops/s per call (full) : 1.53416e+08 +Grid : Message : 84.285176 s : Average mflops/s per call per rank (full): 2.39712e+06 +Grid : Message : 84.285194 s : Average mflops/s per call per node (full): 9.58847e+06 +Grid : Message : 84.285197 s : WilsonFermion5D Stencil +Grid : Message : 84.285271 s : Stencil calls 3001 +Grid : Message : 84.285275 s : Stencil halogtime 0 +Grid : Message : 84.285277 s : Stencil gathertime 55.2059 +Grid : Message : 84.285281 s : Stencil gathermtime 20.0923 +Grid : Message : 84.285283 s : Stencil mergetime 18.9057 +Grid : Message : 84.285286 s : Stencil decompresstime 0.0619793 +Grid : Message : 84.285289 s : Stencil comms_bytes 4.02653e+08 +Grid : Message : 84.285292 s : Stencil commtime 6323.57 +Grid : Message : 84.285295 s : Stencil 63.675 GB/s per rank +Grid : Message : 84.285298 s : Stencil 254.7 GB/s per node +Grid : Message : 84.285301 s : WilsonFermion5D StencilEven +Grid : Message : 84.285316 s : WilsonFermion5D StencilOdd +Grid : Message : 84.285333 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 84.285336 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 84.285337 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 106.985790 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 106.985814 s : Called DwDag +Grid : Message : 106.985815 s : norm dag result 12.0421 +Grid : Message : 107.188790 s : norm dag ref 12.0421 +Grid : Message : 107.349010 s : norm dag diff 7.63254e-14 +Grid : Message : 107.762980 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 107.458374 s : src_e0.499998 +Grid : Message : 107.754073 s : src_o0.500002 +Grid : Message : 107.855191 s : ********************************************************* +Grid : Message : 107.855194 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 107.855195 s : * Vectorising space-time by 8 +Grid : Message : 107.855197 s : * SINGLE precision +Grid : Message : 107.855198 s : * Using Overlapped Comms/Compute +Grid : Message : 107.855199 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 107.855200 s : ********************************************************* +Grid : Message : 121.549348 s : Deo mflop/s = 1.56492e+08 +Grid : Message : 121.549382 s : Deo mflop/s per rank 2.44518e+06 +Grid : Message : 121.549384 s : Deo mflop/s per node 9.78074e+06 +Grid : Message : 121.549387 s : #### Dhop calls report +Grid : Message : 121.549388 s : WilsonFermion5D Number of DhopEO Calls : 3001 +Grid : Message : 121.549390 s : WilsonFermion5D TotalTime /Calls : 4563.01 us +Grid : Message : 121.549393 s : WilsonFermion5D CommTime /Calls : 2967.77 us +Grid : 
Message : 121.549395 s : WilsonFermion5D FaceTime /Calls : 601.095 us +Grid : Message : 121.549397 s : WilsonFermion5D ComputeTime1/Calls : 59.9877 us +Grid : Message : 121.549399 s : WilsonFermion5D ComputeTime2/Calls : 1038.46 us +Grid : Message : 121.549423 s : Average mflops/s per call : 1.2726e+10 +Grid : Message : 121.549428 s : Average mflops/s per call per rank : 1.98843e+08 +Grid : Message : 121.549430 s : Average mflops/s per call per node : 7.95373e+08 +Grid : Message : 121.549432 s : Average mflops/s per call (full) : 1.58131e+08 +Grid : Message : 121.549436 s : Average mflops/s per call per rank (full): 2.4708e+06 +Grid : Message : 121.549440 s : Average mflops/s per call per node (full): 9.88321e+06 +Grid : Message : 121.549442 s : WilsonFermion5D Stencil +Grid : Message : 121.549453 s : WilsonFermion5D StencilEven +Grid : Message : 121.549472 s : WilsonFermion5D StencilOdd +Grid : Message : 121.549484 s : Stencil calls 3001 +Grid : Message : 121.549490 s : Stencil halogtime 0 +Grid : Message : 121.549492 s : Stencil gathertime 55.2206 +Grid : Message : 121.549496 s : Stencil gathermtime 19.4562 +Grid : Message : 121.549500 s : Stencil mergetime 18.3469 +Grid : Message : 121.549502 s : Stencil decompresstime 0.0646451 +Grid : Message : 121.549506 s : Stencil comms_bytes 2.01327e+08 +Grid : Message : 121.549510 s : Stencil commtime 2979.17 +Grid : Message : 121.549512 s : Stencil 67.5782 GB/s per rank +Grid : Message : 121.549514 s : Stencil 270.313 GB/s per node +Grid : Message : 121.549517 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 121.549519 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 121.549522 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 121.625928 s : r_e6.02108 +Grid : Message : 121.634489 s : r_o6.02101 +Grid : Message : 121.640496 s : res12.0421 +Grid : Message : 122.275455 s : norm diff 0 +Grid : Message : 123.135840 s : norm diff even 0 +Grid : Message : 123.389190 s : norm diff odd 0 diff --git a/systems/Tursa/dwf.4node.perf b/systems/Tursa/dwf.4node.perf new file mode 100644 index 00000000..9073969e --- /dev/null +++ b/systems/Tursa/dwf.4node.perf @@ -0,0 +1,245 @@ +tu-c0r0n00 - 0 device=0 binding=--interleave=0,1 +tu-c0r0n00 - 1 device=1 binding=--interleave=2,3 +tu-c0r0n09 - 1 device=1 binding=--interleave=2,3 +tu-c0r0n00 - 2 device=2 binding=--interleave=4,5 +tu-c0r0n06 - 0 device=0 binding=--interleave=0,1 +tu-c0r0n06 - 1 device=1 binding=--interleave=2,3 +tu-c0r0n09 - 0 device=0 binding=--interleave=0,1 +tu-c0r0n09 - 2 device=2 binding=--interleave=4,5 +tu-c0r0n03 - 1 device=1 binding=--interleave=2,3 +tu-c0r0n06 - 2 device=2 binding=--interleave=4,5 +tu-c0r0n09 - 3 device=3 binding=--interleave=6,7 +tu-c0r0n00 - 3 device=3 binding=--interleave=6,7 +tu-c0r0n03 - 0 device=0 binding=--interleave=0,1 +tu-c0r0n03 - 2 device=2 binding=--interleave=4,5 +tu-c0r0n06 - 3 device=3 binding=--interleave=6,7 +tu-c0r0n03 - 3 device=3 binding=--interleave=6,7 +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: 
Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device Number : 0 +AcceleratorCudaInit[0]: ======================== +AcceleratorCudaInit[0]: Device identifier: NVIDIA A100-SXM4-40GB +AcceleratorCudaInit[0]: totalGlobalMem: 42505273344 +AcceleratorCudaInit[0]: managedMemory: 1 +AcceleratorCudaInit[0]: isMultiGpuBoard: 0 +AcceleratorCudaInit[0]: warpSize: 32 +AcceleratorCudaInit[0]: pciBusID: 3 +AcceleratorCudaInit[0]: pciDeviceID: 0 +AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535) +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no +AcceleratorCudaInit: ================================================ +OPENMPI detected +AcceleratorCudaInit: using default device +AcceleratorCudaInit: assume user either uses a) IBM jsrun, or +AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding +AcceleratorCudaInit: Configure options --enable-summit, 
--enable-select-gpu=no +AcceleratorCudaInit: ================================================ +SharedMemoryMpi: World communicator of size 16 +SharedMemoryMpi: Node communicator of size 4 +0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x7fcd80000000 for comms buffers +Setting up IPC + +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|_ | | | | | | | | | | | | _|__ +__|_ _|__ +__|_ GGGG RRRR III DDDD _|__ +__|_ G R R I D D _|__ +__|_ G R R I D D _|__ +__|_ G GG RRRR I D D _|__ +__|_ G G R R I D D _|__ +__|_ GGGG R R III DDDD _|__ +__|_ _|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ +__|__|__|__|__|__|__|__|__|__|__|__|__|__|__ + | | | | | | | | | | | | | | + + +Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +Current Grid git commit hash=9d2238148c56e3fbadfa95dcabf2b83d4bde14cd: (HEAD -> develop) uncommited changes + +Grid : Message : ================================================ +Grid : Message : MPI is initialised and logging filters activated +Grid : Message : ================================================ +Grid : Message : Requested 2147483648 byte stencil comms buffers +Grid : Message : MemoryManager Cache 34004218675 bytes +Grid : Message : MemoryManager::Init() setting up +Grid : Message : MemoryManager::Init() cache pool for recent allocations: SMALL 32 LARGE 8 +Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory +Grid : Message : MemoryManager::Init() Using cudaMalloc +Grid : Message : 1.198523 s : Grid Layout +Grid : Message : 1.198530 s : Global lattice size : 64 64 64 64 +Grid : Message : 1.198534 s : OpenMP threads : 4 +Grid : Message : 1.198535 s : MPI tasks : 2 2 2 2 +Grid : Message : 1.397615 s : Making s innermost grids +Grid : Message : 1.441828 s : Initialising 4d RNG +Grid : Message : 1.547973 s : Intialising parallel RNG with unique string 'The 4D RNG' +Grid : Message : 1.547998 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1 +Grid : Message : 1.954777 s : Initialising 5d RNG +Grid : Message : 3.633825 s : Intialising parallel RNG with unique string 'The 5D RNG' +Grid : Message : 3.633869 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a +Grid : Message : 12.162710 s : Initialised RNGs +Grid : Message : 15.882520 s : Drawing gauge field +Grid : Message : 15.816362 s : Random gauge initialised +Grid : Message : 17.279671 s : Setting up Cshift based reference +Grid : Message : 26.331426 s : ***************************************************************** +Grid : Message : 26.331452 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm +Grid : Message : 26.331454 s : ***************************************************************** +Grid : Message : 26.331456 s : ***************************************************************** +Grid : Message : 26.331458 s : * Benchmarking DomainWallFermionR::Dhop +Grid : Message : 26.331459 s : * Vectorising 
space-time by 8 +Grid : Message : 26.331463 s : * VComplexF size is 64 B +Grid : Message : 26.331465 s : * SINGLE precision +Grid : Message : 26.331467 s : * Using Overlapped Comms/Compute +Grid : Message : 26.331468 s : * Using GENERIC Nc WilsonKernels +Grid : Message : 26.331469 s : ***************************************************************** +Grid : Message : 28.413717 s : Called warmup +Grid : Message : 56.418423 s : Called Dw 3000 times in 2.80047e+07 us +Grid : Message : 56.418476 s : mflop/s = 3.79581e+07 +Grid : Message : 56.418479 s : mflop/s per rank = 2.37238e+06 +Grid : Message : 56.418481 s : mflop/s per node = 9.48953e+06 +Grid : Message : 56.418483 s : RF GiB/s (base 2) = 77130 +Grid : Message : 56.418485 s : mem GiB/s (base 2) = 48206.3 +Grid : Message : 56.422076 s : norm diff 1.03481e-13 +Grid : Message : 56.456894 s : #### Dhop calls report +Grid : Message : 56.456899 s : WilsonFermion5D Number of DhopEO Calls : 6002 +Grid : Message : 56.456903 s : WilsonFermion5D TotalTime /Calls : 4710.93 us +Grid : Message : 56.456905 s : WilsonFermion5D CommTime /Calls : 3196.15 us +Grid : Message : 56.456908 s : WilsonFermion5D FaceTime /Calls : 494.392 us +Grid : Message : 56.456910 s : WilsonFermion5D ComputeTime1/Calls : 44.4107 us +Grid : Message : 56.456912 s : WilsonFermion5D ComputeTime2/Calls : 1037.75 us +Grid : Message : 56.456921 s : Average mflops/s per call : 3.55691e+09 +Grid : Message : 56.456925 s : Average mflops/s per call per rank : 2.22307e+08 +Grid : Message : 56.456928 s : Average mflops/s per call per node : 8.89228e+08 +Grid : Message : 56.456930 s : Average mflops/s per call (full) : 3.82915e+07 +Grid : Message : 56.456933 s : Average mflops/s per call per rank (full): 2.39322e+06 +Grid : Message : 56.456952 s : Average mflops/s per call per node (full): 9.57287e+06 +Grid : Message : 56.456954 s : WilsonFermion5D Stencil +Grid : Message : 56.457016 s : Stencil calls 3001 +Grid : Message : 56.457022 s : Stencil halogtime 0 +Grid : Message : 56.457024 s : Stencil gathertime 55.9154 +Grid : Message : 56.457026 s : Stencil gathermtime 20.1073 +Grid : Message : 56.457028 s : Stencil mergetime 18.5585 +Grid : Message : 56.457030 s : Stencil decompresstime 0.0639787 +Grid : Message : 56.457032 s : Stencil comms_bytes 4.02653e+08 +Grid : Message : 56.457034 s : Stencil commtime 6379.93 +Grid : Message : 56.457036 s : Stencil 63.1124 GB/s per rank +Grid : Message : 56.457038 s : Stencil 252.45 GB/s per node +Grid : Message : 56.457040 s : WilsonFermion5D StencilEven +Grid : Message : 56.457048 s : WilsonFermion5D StencilOdd +Grid : Message : 56.457062 s : WilsonFermion5D Stencil Reporti() +Grid : Message : 56.457065 s : WilsonFermion5D StencilEven Reporti() +Grid : Message : 56.457066 s : WilsonFermion5D StencilOdd Reporti() +Grid : Message : 79.259261 s : Compare to naive wilson implementation Dag to verify correctness +Grid : Message : 79.259287 s : Called DwDag +Grid : Message : 79.259288 s : norm dag result 12.0421 +Grid : Message : 79.271740 s : norm dag ref 12.0421 +Grid : Message : 79.287759 s : norm dag diff 7.63236e-14 +Grid : Message : 79.328100 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec +Grid : Message : 79.955951 s : src_e0.499997 +Grid : Message : 80.633620 s : src_o0.500003 +Grid : Message : 80.164163 s : ********************************************************* +Grid : Message : 80.164168 s : * Benchmarking DomainWallFermionF::DhopEO +Grid : Message : 80.164170 s : * Vectorising space-time by 8 +Grid : Message : 80.164172 s : * SINGLE 
precision
+Grid : Message : 80.164174 s : * Using Overlapped Comms/Compute
+Grid : Message : 80.164177 s : * Using GENERIC Nc WilsonKernels
+Grid : Message : 80.164178 s : *********************************************************
+Grid : Message : 93.797635 s : Deo mflop/s = 3.93231e+07
+Grid : Message : 93.797670 s : Deo mflop/s per rank 2.45769e+06
+Grid : Message : 93.797672 s : Deo mflop/s per node 9.83077e+06
+Grid : Message : 93.797674 s : #### Dhop calls report
+Grid : Message : 93.797675 s : WilsonFermion5D Number of DhopEO Calls : 3001
+Grid : Message : 93.797677 s : WilsonFermion5D TotalTime /Calls : 4542.83 us
+Grid : Message : 93.797679 s : WilsonFermion5D CommTime /Calls : 2978.97 us
+Grid : Message : 93.797681 s : WilsonFermion5D FaceTime /Calls : 602.287 us
+Grid : Message : 93.797683 s : WilsonFermion5D ComputeTime1/Calls : 67.1416 us
+Grid : Message : 93.797685 s : WilsonFermion5D ComputeTime2/Calls : 1004.07 us
+Grid : Message : 93.797713 s : Average mflops/s per call : 3.30731e+09
+Grid : Message : 93.797717 s : Average mflops/s per call per rank : 2.06707e+08
+Grid : Message : 93.797719 s : Average mflops/s per call per node : 8.26827e+08
+Grid : Message : 93.797721 s : Average mflops/s per call (full) : 3.97084e+07
+Grid : Message : 93.797727 s : Average mflops/s per call per rank (full): 2.48178e+06
+Grid : Message : 93.797732 s : Average mflops/s per call per node (full): 9.92711e+06
+Grid : Message : 93.797735 s : WilsonFermion5D Stencil
+Grid : Message : 93.797746 s : WilsonFermion5D StencilEven
+Grid : Message : 93.797758 s : WilsonFermion5D StencilOdd
+Grid : Message : 93.797769 s : Stencil calls 3001
+Grid : Message : 93.797773 s : Stencil halogtime 0
+Grid : Message : 93.797776 s : Stencil gathertime 56.7458
+Grid : Message : 93.797780 s : Stencil gathermtime 22.6504
+Grid : Message : 93.797782 s : Stencil mergetime 21.1913
+Grid : Message : 93.797786 s : Stencil decompresstime 0.0556481
+Grid : Message : 93.797788 s : Stencil comms_bytes 2.01327e+08
+Grid : Message : 93.797791 s : Stencil commtime 2989.33
+Grid : Message : 93.797795 s : Stencil 67.3484 GB/s per rank
+Grid : Message : 93.797798 s : Stencil 269.394 GB/s per node
+Grid : Message : 93.797801 s : WilsonFermion5D Stencil Reporti()
+Grid : Message : 93.797803 s : WilsonFermion5D StencilEven Reporti()
+Grid : Message : 93.797805 s : WilsonFermion5D StencilOdd Reporti()
+Grid : Message : 93.873429 s : r_e6.02111
+Grid : Message : 93.879931 s : r_o6.02102
+Grid : Message : 93.885912 s : res12.0421
+Grid : Message : 94.876555 s : norm diff 0
+Grid : Message : 95.485643 s : norm diff even 0
+Grid : Message : 95.581236 s : norm diff odd 0
diff --git a/systems/Tursa/dwf16.slurm b/systems/Tursa/dwf16.slurm
new file mode 100644
index 00000000..a35e55be
--- /dev/null
+++ b/systems/Tursa/dwf16.slurm
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH -J dslash
+#SBATCH -A tc002
+#SBATCH --time=2:20:00
+#SBATCH --exclusive
+#SBATCH --nodes=16
+#SBATCH --ntasks=64
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:4
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+
+export OMP_NUM_THREADS=4
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+OPT="--comms-overlap --comms-concurrent"   # overlap communications with compute
+
+
+mpirun -np $SLURM_NTASKS -x LD_LIBRARY_PATH --bind-to none ./mpiwrapper.sh \
+        ./benchmarks/Benchmark_dwf_fp32 \
+        $OPT \
+        --mpi 2.2.2.8 \
+        --accelerator-threads 8 \
+        --grid 64.64.64.256 \
+        --shm 2048 > dwf.16node.perf
+
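The decomposition requested here is worth a quick sanity check: --mpi 2.2.2.8 divides the 64.64.64.256 global lattice across 64 ranks (16 nodes x 4 GPUs), so each rank owns a 32.32.32.32 local volume. That is the same local volume as the 4-node job below, which is why the per-node mflop/s figures in dwf.16node.perf (9.51428e+06) and dwf.4node.perf (9.48953e+06) are directly comparable. The arithmetic, as an illustrative snippet that is not part of the commit:

  #!/bin/bash
  # Illustrative check: local extent per rank = global extent / MPI ranks, per dimension.
  GRID=(64 64 64 256); MPI=(2 2 2 8)
  for d in 0 1 2 3; do echo -n "$(( GRID[d] / MPI[d] )) "; done
  echo "(local extents per rank)"   # prints: 32 32 32 32
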
diff --git a/systems/Tursa/dwf4.slurm b/systems/Tursa/dwf4.slurm
new file mode 100644
index 00000000..65191398
--- /dev/null
+++ b/systems/Tursa/dwf4.slurm
@@ -0,0 +1,37 @@
+#!/bin/bash
+#SBATCH -J dslash
+#SBATCH -A tc002
+#SBATCH --time=2:20:00
+#SBATCH --nodelist=tu-c0r0n[00,03,06,09]
+#SBATCH --exclusive
+#SBATCH --nodes=4
+#SBATCH --ntasks=16
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --partition=gpu
+#SBATCH --gres=gpu:4
+#SBATCH --output=%x.%j.out
+#SBATCH --error=%x.%j.err
+
+export OMP_NUM_THREADS=4
+export OMPI_MCA_btl=^uct,openib
+export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
+export UCX_RNDV_SCHEME=put_zcopy
+export UCX_RNDV_THRESH=16384
+export UCX_IB_GPU_DIRECT_RDMA=yes
+export UCX_MEMTYPE_CACHE=n
+OPT="--comms-overlap --comms-concurrent"   # overlap communications with compute
+
+
+mpirun -np $SLURM_NTASKS -x LD_LIBRARY_PATH --bind-to none \
+        ./mpiwrapper.sh \
+        ./benchmarks/Benchmark_dwf_fp32 \
+        $OPT \
+        --mpi 2.2.2.2 \
+        --accelerator-threads 8 \
+        --grid 64.64.64.64 \
+        --shm 2048 > dwf.4node.perf
+
+
+
+
diff --git a/systems/Tursa/mpiwrapper.sh b/systems/Tursa/mpiwrapper.sh
new file mode 100755
index 00000000..4d96ac67
--- /dev/null
+++ b/systems/Tursa/mpiwrapper.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Pin each local MPI rank to its own GPU, HCA, and pair of NUMA domains.
+
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+# Two NUMA domains per GPU: rank l interleaves domains 2l and 2l+1.
+numa1=$(( 2 * $lrank ))
+numa2=$(( 2 * $lrank + 1 ))
+netdev=mlx5_${lrank}:1
+
+export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
+export UCX_NET_DEVICES=$netdev
+BINDING="--interleave=$numa1,$numa2"
+
+echo "`hostname` - $lrank device=$CUDA_VISIBLE_DEVICES binding=$BINDING"
+
+numactl ${BINDING} "$@"
+
+
diff --git a/systems/Tursa/sourceme.sh b/systems/Tursa/sourceme.sh
new file mode 100644
index 00000000..6286750d
--- /dev/null
+++ b/systems/Tursa/sourceme.sh
@@ -0,0 +1,2 @@
+spack load c-lime
+module load cuda/11.4.1 openmpi/4.1.1 ucx/1.10.1
diff --git a/tests/IO/Test_serialisation.cc b/tests/IO/Test_serialisation.cc
index 27fe589e..e1596ea6 100644
--- a/tests/IO/Test_serialisation.cc
+++ b/tests/IO/Test_serialisation.cc
@@ -48,7 +48,9 @@ public:
                           std::vector<double>, array,
                           std::vector<std::vector<double> >, twodimarray,
                           std::vector<std::vector<std::vector<std::complex<double>> > >, cmplx3darray,
-                          SpinColourMatrix, scm
+                          SpinColourMatrix, scm,
+                          std::vector<std::vector<std::vector<int> > >, ragged,
+                          std::vector<std::vector<SpinColourMatrix> >, vscm
   );
   myclass() {}
   myclass(int i)
@@ -56,6 +58,10 @@ public:
   , twodimarray(3,std::vector<double>(5, 1.23456))
   , cmplx3darray(3,std::vector<std::vector<std::complex<double>>>(5, std::vector<std::complex<double>>(7, std::complex<double>(1.2, 3.4))))
   , ve(2, myenum::blue)
+  , ragged( {{{i+1},{i+2,i+3}}, // ragged
+             {{i+4,i+5,i+6,i+7},{i+8,i+9,i+10,i+11},{i+12,i+13,i+14,i+15}}, // block
+             {{i+16,i+17},{i+18,i+19,i+20}}} ) //ragged
+  , vscm(3, std::vector<SpinColourMatrix>(5))
   {
     e=myenum::red;
     x=i;
@@ -68,6 +74,13 @@ public:
     scm()(0, 2)(1, 1) = 6.336;
     scm()(2, 1)(2, 2) = 7.344;
     scm()(1, 1)(2, 0) = 8.3534;
+    int Counter = i;
+    for( auto & v : vscm ) {
+      for( auto & j : v ) {
+        j = std::complex<double>(Counter, -Counter);
+        Counter++;
+      }
+    }
   }
 };
diff --git a/tests/Test_dwf_mixedcg_prec_halfcomms.cc b/tests/Test_dwf_mixedcg_prec_halfcomms.cc
index 8b0126dc..ff52b0d1 100644
--- a/tests/Test_dwf_mixedcg_prec_halfcomms.cc
+++ b/tests/Test_dwf_mixedcg_prec_halfcomms.cc
@@ -29,19 +29,12 @@ Author: Peter Boyle
 using namespace std;
 using namespace Grid;
- ;
-template<class d>
-struct scal {
-  d internal;
-};
+#if 1
+int main (int argc, char ** argv) {}
+
+#else
-
-  Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
 
 int main (int argc, char ** argv)
 {
@@ -124,3 +117,4 @@ int main (int argc, char ** argv)
   Grid_finalize();
 }
+#endif
diff --git a/tests/Test_meson_field.cc b/tests/Test_meson_field.cc
new file mode 100644
index 00000000..25d908d7
--- /dev/null
+++ b/tests/Test_meson_field.cc
@@ -0,0 +1,148 @@
+/*************************************************************************************
+
+Grid physics library, www.github.com/paboyle/Grid
+
+Source file: tests/core/Test_meson_field.cc
+
+Copyright (C) 2015-2018
+
+Author: Felix Erben
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+
+#include <Grid/Grid.h>
+#include <Grid/qcd/utils/A2Autils.h>
+
+using namespace Grid;
+
+const int TSRC = 0; //timeslice where rho is nonzero
+const int VDIM = 5; //length of each vector
+
+typedef typename DomainWallFermionR::ComplexField ComplexField;
+typedef typename DomainWallFermionR::FermionField FermionField;
+
+int main(int argc, char *argv[])
+{
+  // initialization
+  Grid_init(&argc, &argv);
+  std::cout << GridLogMessage << "Grid initialized" << std::endl;
+
+  // Lattice and rng setup
+  Coordinate latt_size   = GridDefaultLatt();
+  Coordinate simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
+  Coordinate mpi_layout  = GridDefaultMpi();
+  GridCartesian grid(latt_size,simd_layout,mpi_layout);
+  int Nt = GridDefaultLatt()[Tp];
+  Lattice<iScalar<vInteger>> t(&grid);
+  LatticeCoordinate(t, Tp);
+  std::vector<int> seeds({1,2,3,4});
+  GridParallelRNG pRNG(&grid);
+  pRNG.SeedFixedIntegers(seeds);
+
+  // MesonField lhs and rhs vectors
+  std::vector<FermionField> phi(VDIM,&grid);
+  std::vector<FermionField> rho(VDIM,&grid);
+  FermionField rho_tmp(&grid);
+  std::cout << GridLogMessage << "Initialising random meson fields" << std::endl;
+  for (unsigned int i = 0; i < VDIM; ++i){
+    random(pRNG,phi[i]);
+    random(pRNG,rho_tmp); //ideally only nonzero on t=0
+    rho[i] = where((t==TSRC), rho_tmp, 0.*rho_tmp); //ideally only nonzero on t=0
+  }
+  std::cout << GridLogMessage << "Meson fields initialised, rho non-zero only for t = " << TSRC << std::endl;
+
+  // Gamma matrices used in the contraction
+  std::vector<Gamma::Algebra> Gmu = {
+    Gamma::Algebra::GammaX,
+    Gamma::Algebra::GammaY,
+    Gamma::Algebra::GammaZ,
+    Gamma::Algebra::GammaT
+  };
+
+  // momentum phases e^{ipx}
+  std::vector<std::vector<double>> momenta = {
+    {0.,0.,0.},
+    {1.,0.,0.},
+    {1.,1.,0.},
+    {1.,1.,1.},
+    {2.,0.,0.}
+  };
+
+  std::cout << GridLogMessage << "Meson fields will be created for " << Gmu.size() << " Gamma matrices and " << momenta.size() << " momenta." << std::endl;
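+  /* The phase fields below implement e^{ipx}: each momenta[j] holds integer
+   * mode numbers n = (n_x,n_y,n_z), the inner loop accumulates
+   * sum_mu n_mu * x_mu / L_mu into a ComplexField, and the final exp()
+   * turns this into exp(2*pi*i n.x/L), a plane wave with p_mu = 2*pi*n_mu/L_mu. */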
+
+  std::cout << GridLogMessage << "Computing complex phases" << std::endl;
+  std::vector<ComplexField> phases(momenta.size(),&grid);
+  ComplexField coor(&grid);
+  Complex Ci(0.0,1.0);
+  for (unsigned int j = 0; j < momenta.size(); ++j)
+  {
+    phases[j] = Zero();
+    for(unsigned int mu = 0; mu < momenta[j].size(); mu++)
+    {
+      LatticeCoordinate(coor, mu);
+      phases[j] = phases[j] + momenta[j][mu]/GridDefaultLatt()[mu]*coor;
+    }
+    phases[j] = exp((Real)(2*M_PI)*Ci*phases[j]);
+  }
+  std::cout << GridLogMessage << "Computing complex phases done." << std::endl;
+
+  Eigen::Tensor<ComplexD,5,Eigen::RowMajor> Mpp(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
+  Eigen::Tensor<ComplexD,5,Eigen::RowMajor> Mpr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
+  Eigen::Tensor<ComplexD,5,Eigen::RowMajor> Mrr(momenta.size(),Gmu.size(),Nt,VDIM,VDIM);
+
+  // timer
+  double start,stop;
+
+  //execute meson field routine
+  start = usecond();
+  A2Autils<WilsonImplR>::MesonField(Mpp,&phi[0],&phi[0],Gmu,phases,Tp);
+  stop = usecond();
+  std::cout << GridLogMessage << "M(phi,phi) created, execution time " << stop-start << " us" << std::endl;
+  start = usecond();
+  /* Ideally, for this meson field we could pass TSRC (even better a list of timeslices)
+   * to the routine so that all the components which are predictably equal to zero are not computed. */
+  A2Autils<WilsonImplR>::MesonField(Mpr,&phi[0],&rho[0],Gmu,phases,Tp);
+  stop = usecond();
+  std::cout << GridLogMessage << "M(phi,rho) created, execution time " << stop-start << " us" << std::endl;
+  start = usecond();
+  A2Autils<WilsonImplR>::MesonField(Mrr,&rho[0],&rho[0],Gmu,phases,Tp);
+  stop = usecond();
+  std::cout << GridLogMessage << "M(rho,rho) created, execution time " << stop-start << " us" << std::endl;
+
+  std::string FileName = "Meson_Fields";
+#ifdef HAVE_HDF5
+  using Default_Reader = Grid::Hdf5Reader;
+  using Default_Writer = Grid::Hdf5Writer;
+  FileName.append(".h5");
+#else
+  using Default_Reader = Grid::BinaryReader;
+  using Default_Writer = Grid::BinaryWriter;
+  FileName.append(".bin");
+#endif
+
+  Default_Writer w(FileName);
+  write(w,"phi_phi",Mpp);
+  write(w,"phi_rho",Mpr);
+  write(w,"rho_rho",Mrr);
+
+  // epilogue
+  std::cout << GridLogMessage << "Grid is finalizing now" << std::endl;
+  Grid_finalize();
+
+  return EXIT_SUCCESS;
+}
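Taken together, the systems/Tursa files give a complete recipe for reproducing the benchmark numbers above. A minimal submission sketch, illustrative and not part of the commit; it assumes the two slurm scripts and mpiwrapper.sh sit in a build directory containing benchmarks/Benchmark_dwf_fp32, as the relative paths in the mpirun commands require:

  #!/bin/bash
  # Illustrative driver, not part of the commit.
  source sourceme.sh     # spack/module environment (cuda, openmpi, ucx)
  sbatch dwf4.slurm      # writes dwf.4node.perf
  sbatch dwf16.slurm     # writes dwf.16node.perf
  # once both jobs have finished, compare per-node throughput:
  grep "mflop/s per node" dwf.4node.perf dwf.16node.perf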