diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 2cbc895c..3c2eb428 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -168,6 +168,7 @@ public: template void FFT_dim(Lattice &result,const Lattice &source,int dim, int sign){ #ifndef HAVE_FFTW + std::cerr << "FFTW is not compiled but is called"< pgbuf(&pencil_g); autoView(pgbuf_v , pgbuf, CpuWrite); - + std::cout << "CPU view" << std::endl; + typedef typename FFTW::FFTW_scalar FFTW_scalar; typedef typename FFTW::FFTW_plan FFTW_plan; @@ -213,6 +215,7 @@ public: else if ( sign == forward ) div = 1.0; else assert(0); + std::cout << "Making FFTW plan" << std::endl; FFTW_plan p; { FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0]; @@ -226,6 +229,7 @@ public: } // Barrel shift and collect global pencil + std::cout << "Making pencil" << std::endl; Coordinate lcoor(Nd), gcoor(Nd); result = source; int pc = processor_coor[dim]; @@ -247,6 +251,7 @@ public: } } + std::cout << "Looping orthog" << std::endl; // Loop over orthog coords int NN=pencil_g.lSites(); GridStopWatch timer; @@ -269,6 +274,7 @@ public: usec += timer.useconds(); flops+= flops_call*NN; + std::cout << "Writing back results " << std::endl; // writing out result { autoView(pgbuf_v,pgbuf,CpuRead); @@ -285,6 +291,7 @@ public: } result = result*div; + std::cout << "Destroying plan " << std::endl; // destroying plan FFTW::fftw_destroy_plan(p); #endif diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h index d41eb279..e00e94c9 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShift.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShift.h @@ -102,11 +102,11 @@ public: assert(mass.size()==nshift); assert(mresidual.size()==nshift); - // dynamic sized arrays on stack; 2d is a pain with vector - RealD bs[nshift]; - RealD rsq[nshift]; - RealD z[nshift][2]; - int converged[nshift]; + // remove dynamic sized arrays on stack; 2d is a pain with vector + std::vector bs(nshift); + std::vector rsq(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h index 23baff61..c6102eb2 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h @@ -123,11 +123,11 @@ public: assert(mresidual.size()==nshift); // dynamic sized arrays on stack; 2d is a pain with vector - RealD bs[nshift]; - RealD rsq[nshift]; - RealD rsqf[nshift]; - RealD z[nshift][2]; - int converged[nshift]; + std::vector bs(nshift); + std::vector rsq(nshift); + std::vector rsqf(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h index d3fb282a..24a3228a 100644 --- a/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h +++ b/Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h @@ -156,11 +156,11 @@ public: assert(mresidual.size()==nshift); // dynamic sized arrays on stack; 2d is a pain with vector - RealD bs[nshift]; - RealD rsq[nshift]; - RealD rsqf[nshift]; - RealD z[nshift][2]; - int converged[nshift]; + std::vector bs(nshift); + std::vector rsq(nshift); + std::vector rsqf(nshift); + std::vector > z(nshift); + std::vector converged(nshift); const int primary =0; diff --git a/Grid/algorithms/multigrid/CoarsenedMatrix.h b/Grid/algorithms/multigrid/CoarsenedMatrix.h index 42634004..60a5920c 100644 --- a/Grid/algorithms/multigrid/CoarsenedMatrix.h +++ b/Grid/algorithms/multigrid/CoarsenedMatrix.h @@ -99,7 +99,7 @@ public: CoarseMatrix AselfInvEven; CoarseMatrix AselfInvOdd; - Vector dag_factor; + deviceVector dag_factor; /////////////////////// // Interface @@ -124,9 +124,13 @@ public: int npoint = geom.npoint; typedef LatticeView Aview; - Vector AcceleratorViewContainer; + deviceVector AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); - for(int p=0;p Aview; - Vector AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;poSites(); - Vector points(geom.npoint, 0); - for(int p=0; p points(geom.npoint); + for(int p=0; p Aview; - Vector AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;p &out) { @@ -469,14 +484,20 @@ public: // determine in what order we need the points int npoint = geom.npoint-1; - Vector points(npoint, 0); - for(int p=0; p points(npoint); + for(int p=0; p AcceleratorViewContainer; - for(int p=0;p AcceleratorViewContainer(geom.npoint); + hostVector hAcceleratorViewContainer(geom.npoint); + + for(int p=0;p h_dag_factor(nbasis*nbasis); thread_for(i, nbasis*nbasis, { int j = i/nbasis; int k = i%nbasis; - dag_factor[i] = dag_factor_eigen(j, k); + h_dag_factor[i] = dag_factor_eigen(j, k); }); + acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD)); } void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase > &linop, diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 293ce2fb..8946a364 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -174,21 +174,11 @@ template inline bool operator!=(const devAllocator<_Tp>&, const d //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -#ifdef ACCELERATOR_CSHIFT -// Cshift on device -template using cshiftAllocator = devAllocator; -#else -// Cshift on host -template using cshiftAllocator = std::allocator; -#endif +template using hostVector = std::vector >; // Needs autoview +template using Vector = std::vector >; // +template using uvmVector = std::vector >; // auto migrating page +template using deviceVector = std::vector >; // device vector -template using Vector = std::vector >; -template using stencilVector = std::vector >; -template using commVector = std::vector >; -template using deviceVector = std::vector >; -template using cshiftVector = std::vector >; - -/* template class vecView { protected: @@ -197,8 +187,9 @@ template class vecView ViewMode mode; void * cpu_ptr; public: + // Rvalue accessor accelerator_inline T & operator[](size_t i) const { return this->data[i]; }; - vecView(std::vector &refer_to_me,ViewMode _mode) + vecView(Vector &refer_to_me,ViewMode _mode) { cpu_ptr = &refer_to_me[0]; size = refer_to_me.size(); @@ -214,26 +205,15 @@ template class vecView } }; -template vecView VectorView(std::vector &vec,ViewMode _mode) +template vecView VectorView(Vector &vec,ViewMode _mode) { vecView ret(vec,_mode); // does the open return ret; // must be closed } -// Little autoscope assister -template -class VectorViewCloser -{ - View v; // Take a copy of view and call view close when I go out of scope automatically - public: - VectorViewCloser(View &_v) : v(_v) {}; - ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);} -}; - #define autoVecView(v_v,v,mode) \ auto v_v = VectorView(v,mode); \ ViewCloser _autoView##v_v(v_v); -*/ NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryStats.cc b/Grid/allocator/MemoryStats.cc index 0d1707d9..37269785 100644 --- a/Grid/allocator/MemoryStats.cc +++ b/Grid/allocator/MemoryStats.cc @@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES) uint64_t virt_pfn = (uint64_t)Buf / page_size; off_t offset = sizeof(uint64_t) * virt_pfn; uint64_t npages = (BYTES + page_size-1) / page_size; - uint64_t pagedata[npages]; + std::vector pagedata(npages); uint64_t ret = lseek(fd, offset, SEEK_SET); assert(ret == offset); - ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); + ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages); assert(ret == sizeof(uint64_t) * npages); int nhugepages = npages / 512; int n4ktotal, nnothuge; diff --git a/Grid/cshift/Cshift.h b/Grid/cshift/Cshift.h index c7b9e3cb..ae1dea51 100644 --- a/Grid/cshift/Cshift.h +++ b/Grid/cshift/Cshift.h @@ -51,7 +51,6 @@ Author: Peter Boyle #endif NAMESPACE_BEGIN(Grid); - template::value,void>::type * = nullptr> auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr)) { diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index 309517b2..fdb98cd4 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -30,12 +30,11 @@ Author: Peter Boyle NAMESPACE_BEGIN(Grid); extern std::vector > Cshift_table; -extern commVector > Cshift_table_device; +extern deviceVector > Cshift_table_device; inline std::pair *MapCshiftTable(void) { // GPU version -#ifdef ACCELERATOR_CSHIFT uint64_t sz=Cshift_table.size(); if (Cshift_table_device.size()!=sz ) { Cshift_table_device.resize(sz); @@ -45,16 +44,13 @@ inline std::pair *MapCshiftTable(void) sizeof(Cshift_table[0])*sz); return &Cshift_table_device[0]; -#else - return &Cshift_table[0]; -#endif // CPU version use identify map } /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dimension,int plane,int cbmask, int off=0) +Gather_plane_simple (const Lattice &rhs,deviceVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dim { auto buffer_p = & buffer[0]; auto table = MapCshiftTable(); -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for(i,ent,{ - buffer_p[table[i].first]=rhs_v[table[i].second]; - }); -#endif } } @@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(nn,e1*e2,1,{ int n = nn%e1; @@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice &rhs, vobj temp =rhs_v[so+o+b]; extract(temp,pointers,offset); }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for2d(n,e1,b,e2,{ - int o = n*n1; - int offset = b+n*e2; - - vobj temp =rhs_v[so+o+b]; - extract(temp,pointers,offset); - }); -#endif } else { Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < &rhs, extract(temp,pointers,offset); } }); -#else - autoView(rhs_v , rhs, CpuRead); - thread_for2d(n,e1,b,e2,{ - - Coordinate coor; - - int o=n*n1; - int oindex = o+b; - - int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); - - int ocb=1<(temp,pointers,offset); - } - }); -#endif } } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,cshiftVector &buffer, int dimension,int plane,int cbmask) +template void Scatter_plane_simple (Lattice &rhs,deviceVector &buffer, int dimension,int plane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -245,17 +202,10 @@ template void Scatter_plane_simple (Lattice &rhs,cshiftVector< { auto buffer_p = & buffer[0]; auto table = MapCshiftTable(); -#ifdef ACCELERATOR_CSHIFT autoView( rhs_v, rhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); }); -#else - autoView( rhs_v, rhs, CpuWrite); - thread_for(i,ent,{ - rhs_v[table[i].first]=buffer_p[table[i].second]; - }); -#endif } } @@ -278,7 +228,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA if(cbmask ==0x3 ) { int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; -#ifdef ACCELERATOR_CSHIFT autoView( rhs_v , rhs, AcceleratorWrite); accelerator_for(nn,e1*e2,1,{ int n = nn%e1; @@ -287,14 +236,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); -#else - autoView( rhs_v , rhs, CpuWrite); - thread_for2d(n,e1,b,e2,{ - int o = n*_slice_stride; - int offset = b+n*_slice_block; - merge(rhs_v[so+o+b],pointers,offset); - }); -#endif } else { // Case of SIMD split AND checker dim cannot currently be hit, except in @@ -360,19 +301,11 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs { auto table = MapCshiftTable(); -#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); -#else - autoView(rhs_v , rhs, CpuRead); - autoView(lhs_v , lhs, CpuWrite); - thread_for(i,ent,{ - lhs_v[table[i].first]=rhs_v[table[i].second]; - }); -#endif } } @@ -412,19 +345,11 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice Lattice Cshift(const Lattice &rhs,int dimension RealD t1,t0; t0=usecond(); if ( !comm_dim ) { - //std::cout << "CSHIFT: Cshift_local" < void Cshift_comms(Lattice& ret,const Lattice &r sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); - // std::cout << "Cshift_comms dim "< void Cshift_comms_simd(Lattice& ret,const LatticeCheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even); sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd); - //std::cout << "Cshift_comms_simd dim "< void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { typedef typename vobj::vector_type vector_type; @@ -125,8 +123,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - static cshiftVector send_buf; send_buf.resize(buffer_size); - static cshiftVector recv_buf; recv_buf.resize(buffer_size); + static deviceVector send_buf; send_buf.resize(buffer_size); + static deviceVector recv_buf; recv_buf.resize(buffer_size); int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -161,7 +159,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); tcomms-=usecond(); - // grid->Barrier(); + grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, @@ -169,7 +167,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r recv_from_rank, bytes); xbytes+=bytes; - // grid->Barrier(); + grid->Barrier(); tcomms+=usecond(); tscatter-=usecond(); @@ -177,13 +175,11 @@ template void Cshift_comms(Lattice &ret,const Lattice &r tscatter+=usecond(); } } - /* std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -201,9 +197,9 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_simd_layout[dimension]; int comm_dim = grid->_processors[dimension] >1 ; - //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "< void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); - static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; @@ -281,7 +277,7 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); tcomms-=usecond(); - // grid->Barrier(); + grid->Barrier(); send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0]; @@ -292,7 +288,7 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + grid->Barrier(); tcomms+=usecond(); rpointers[i] = &recv_buf_extract[i][0]; @@ -305,242 +301,12 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) -{ - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_type scalar_type; - - GridBase *grid=rhs.Grid(); - Lattice temp(rhs.Grid()); - - int fd = rhs.Grid()->_fdimensions[dimension]; - int rd = rhs.Grid()->_rdimensions[dimension]; - int pd = rhs.Grid()->_processors[dimension]; - int simd_layout = rhs.Grid()->_simd_layout[dimension]; - int comm_dim = rhs.Grid()->_processors[dimension] >1 ; - assert(simd_layout==1); - assert(comm_dim==1); - assert(shift>=0); - assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); - static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); - vobj *send_buf; - vobj *recv_buf; - { - grid->ShmBufferFreeAll(); - size_t bytes = buffer_size*sizeof(vobj); - send_buf=(vobj *)grid->ShmBufferMalloc(bytes); - recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); - } - - int cb= (cbmask==0x2)? Odd : Even; - int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - - for(int x=0;x>1; - - int bytes = words * sizeof(vobj); - - tgather-=usecond(); - Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); - tgather+=usecond(); - - // int rank = grid->_processor; - int recv_from_rank; - int xmit_to_rank; - grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - - - tcomms-=usecond(); - // grid->Barrier(); - - acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); - grid->SendToRecvFrom((void *)&send_buf[0], - xmit_to_rank, - (void *)&recv_buf[0], - recv_from_rank, - bytes); - xbytes+=bytes; - acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); - - // grid->Barrier(); - tcomms+=usecond(); - - tscatter-=usecond(); - Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); - tscatter+=usecond(); - } - } - /* - std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) -{ - GridBase *grid=rhs.Grid(); - const int Nsimd = grid->Nsimd(); - typedef typename vobj::vector_type vector_type; - typedef typename vobj::scalar_object scalar_object; - typedef typename vobj::scalar_type scalar_type; - - int fd = grid->_fdimensions[dimension]; - int rd = grid->_rdimensions[dimension]; - int ld = grid->_ldimensions[dimension]; - int pd = grid->_processors[dimension]; - int simd_layout = grid->_simd_layout[dimension]; - int comm_dim = grid->_processors[dimension] >1 ; - - //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); - assert(shiftPermuteType(dimension); - - /////////////////////////////////////////////// - // Simd direction uses an extract/merge pair - /////////////////////////////////////////////// - int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; - // int words = sizeof(vobj)/sizeof(vector_type); - - static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); - static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); - scalar_object * recv_buf_extract_mpi; - scalar_object * send_buf_extract_mpi; - { - size_t bytes = sizeof(scalar_object)*buffer_size; - grid->ShmBufferFreeAll(); - send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); - recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); - } - for(int s=0;s pointers(Nsimd); // - ExtractPointerArray rpointers(Nsimd); // received pointers - - /////////////////////////////////////////// - // Work out what to send where - /////////////////////////////////////////// - int cb = (cbmask==0x2)? Odd : Even; - int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - - // loop over outer coord planes orthog to dim - for(int x=0;x>(permute_type+1)); - int ic= (i&inner_bit)? 1:0; - - int my_coor = rd*ic + x; - int nbr_coor = my_coor+sshift; - int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors - - int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer - int nbr_ox = (nbr_coor%rd); // outer coord of peer - int nbr_lane = (i&(~inner_bit)); - - int recv_from_rank; - int xmit_to_rank; - - if (nbr_ic) nbr_lane|=inner_bit; - - assert (sx == nbr_ox); - - if(nbr_proc){ - grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - - tcomms-=usecond(); - // grid->Barrier(); - - acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); - grid->SendToRecvFrom((void *)send_buf_extract_mpi, - xmit_to_rank, - (void *)recv_buf_extract_mpi, - recv_from_rank, - bytes); - acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); - xbytes+=bytes; - - // grid->Barrier(); - tcomms+=usecond(); - rpointers[i] = &recv_buf_extract[i][0]; - } else { - rpointers[i] = &send_buf_extract[nbr_lane][0]; - } - - } - tscatter-=usecond(); - Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); - tscatter+=usecond(); - - } - /* - std::cout << GridLogPerformance << " Cshift (s) copy "< NAMESPACE_BEGIN(Grid); std::vector > Cshift_table; -commVector > Cshift_table_device; +deviceVector > Cshift_table_device; NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 03a869fb..c9c65928 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) typedef decltype(basis[0]) Field; typedef decltype(basis[0].View(AcceleratorRead)) View; - Vector basis_v; basis_v.reserve(basis.size()); - typedef typename std::remove_reference::type vobj; + hostVector h_basis_v(basis.size()); + deviceVector d_basis_v(basis.size()); + typedef typename std::remove_reference::type vobj; typedef typename std::remove_reference::type Coeff_t; + GridBase* grid = basis[0].Grid(); for(int k=0;k Bt(Nm * max_threads); - thread_region - { - vobj* B = &Bt[Nm * thread_num()]; - thread_for_in_region(ss, grid->oSites(),{ - for(int j=j0; joSites(); uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead - Vector Bt(siteBlock * nrot); + deviceVector Bt(siteBlock * nrot); auto Bp=&Bt[0]; // GPU readable copy of matrix - Vector Qt_jv(Nm*Nm); + hostVector h_Qt_jv(Nm*Nm); + deviceVector Qt_jv(Nm*Nm); Coeff_t *Qt_p = & Qt_jv[0]; thread_for(i,Nm*Nm,{ int j = i/Nm; int k = i%Nm; - Qt_p[i]=Qt(j,k); + h_Qt_jv[i]=Qt(j,k); }); + acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t)); // Block the loop to keep storage footprint down for(uint64_t s=0;s &basis,Eigen::MatrixXd& Qt,in result.Checkerboard() = basis[0].Checkerboard(); - Vector basis_v; basis_v.reserve(basis.size()); + hostVector h_basis_v(basis.size()); + deviceVector d_basis_v(basis.size()); for(int k=0;k Qt_jv(Nm); - double * Qt_j = & Qt_jv[0]; - for(int k=0;k Qt_jv(Nm); + double * Qt_j = & Qt_jv[0]; + for(int k=0;koSites(),vobj::Nsimd(),{ vobj zzz=Zero(); @@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in } coalescedWrite(result_v[ss], B); }); - for(int k=0;k diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 58004eac..92eb0562 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites) // const int Nsimd = vobj::Nsimd(); const int nthread = GridThread::GetThreads(); - Vector sumarray(nthread); + std::vector sumarray(nthread); for(int i=0;i sumarray(nthread); + std::vector sumarray(nthread); for(int i=0;i &z,sobj a,sobj b,const Lattice &x,const Latt autoView( x_v, x, AcceleratorRead); autoView( y_v, y, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); -#if 0 - typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t; - Vector inner_tmp(sites); - auto inner_tmp_v = &inner_tmp[0]; - - accelerator_for( ss, sites, nsimd,{ - auto tmp = a*x_v(ss)+b*y_v(ss); - coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp)); - coalescedWrite(z_v[ss],tmp); - }); - nrm = real(TensorRemove(sum(inner_tmp_v,sites))); -#else typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t; deviceVector inner_tmp; inner_tmp.resize(sites); @@ -366,7 +354,6 @@ axpby_norm_fast(Lattice &z,sobj a,sobj b,const Lattice &x,const Latt coalescedWrite(z_v[ss],tmp); }); nrm = real(TensorRemove(sumD(inner_tmp_v,sites))); -#endif grid->GlobalSum(nrm); return nrm; } @@ -377,7 +364,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti conformable(left,right); typedef typename vobj::vector_typeD vector_type; - Vector tmp(2); + std::vector tmp(2); GridBase *grid = left.Grid(); @@ -387,8 +374,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice &left,const Latti // GPU typedef decltype(innerProductD(vobj(),vobj())) inner_t; typedef decltype(innerProductD(vobj(),vobj())) norm_t; - Vector inner_tmp(sites); - Vector norm_tmp(sites); + deviceVector inner_tmp(sites); + deviceVector norm_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; auto norm_tmp_v = &norm_tmp[0]; { @@ -438,7 +425,9 @@ inline auto sum(const LatticeTrinaryExpression & expr) // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc... ////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template inline void sliceSum(const Lattice &Data,std::vector &result,int orthogdim) +template inline void sliceSum(const Lattice &Data, + std::vector &result, + int orthogdim) { /////////////////////////////////////////////////////// // FIXME precision promoted summation @@ -460,8 +449,8 @@ template inline void sliceSum(const Lattice &Data,std::vector< int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,Zero()); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,Zero()); // sum across these down to scalars ExtractBuffer extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node @@ -552,8 +541,8 @@ static void sliceInnerProductVector( std::vector & result, const Latti int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,scalar_type(0.0)); // sum across these down to scalars ExtractBuffer > extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node for IO to file diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index e82494f5..91cb8226 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi // Move out of UVM // Turns out I had messed up the synchronise after move to compute stream // as running this on the default stream fools the synchronise -#undef UVM_BLOCK_BUFFER -#ifndef UVM_BLOCK_BUFFER - commVector buffer(numBlocks); + deviceVector buffer(numBlocks); sobj *buffer_v = &buffer[0]; sobj result; reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); accelerator_barrier(); acceleratorCopyFromDevice(buffer_v,&result,sizeof(result)); -#else - Vector buffer(numBlocks); - sobj *buffer_v = &buffer[0]; - sobj result; - reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size); - accelerator_barrier(); - result = *buffer_v; -#endif return result; } @@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi const int words = sizeof(vobj)/sizeof(vector); - Vector buffer(osites); + deviceVector buffer(osites); vector *dat = (vector *)lat; vector *buf = &buffer[0]; iScalar *tbuf =(iScalar *) &buffer[0]; diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h index 7dff7939..3718e6ea 100644 --- a/Grid/lattice/Lattice_reduction_sycl.h +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -10,7 +10,7 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os { typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_objectD sobjD; -#if 1 + sobj identity; zeroit(identity); sobj ret; zeroit(ret); Integer nsimd= vobj::Nsimd(); @@ -28,32 +28,6 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os } sobjD dret; convertType(dret,ret); return dret; -#else - static Vector mysum; - mysum.resize(1); - sobj *mysum_p = & mysum[0]; - sobj identity; zeroit(identity); - acceleratorPut(mysum[0],identity); - sobj ret ; - - Integer nsimd= vobj::Nsimd(); - - const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList); - cgh.parallel_for(cl::sycl::range<1>{osites}, - Reduction, - [=] (cl::sycl::id<1> item, auto &sum) { - auto osite = item[0]; - sum +=Reduce(lat[osite]); - }); - }); - theGridAccelerator->wait(); - ret = mysum[0]; - // free(mysum,*theGridAccelerator); - sobjD dret; convertType(dret,ret); - return dret; -#endif } template @@ -97,7 +71,6 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite template Word svm_xor(Word *vec,uint64_t L) { -#if 1 Word identity; identity=0; Word ret = 0; { @@ -113,60 +86,7 @@ template Word svm_xor(Word *vec,uint64_t L) } theGridAccelerator->wait(); return ret; -#else - static Vector d_sum; - d_sum.resize(1); - Word *d_sum_p=&d_sum[0]; - Word identity; identity=0; - acceleratorPut(d_sum[0],identity); - const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() }); - theGridAccelerator->submit([&](cl::sycl::handler &cgh) { - auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList); - cgh.parallel_for(cl::sycl::range<1>{L}, - Reduction, - [=] (cl::sycl::id<1> index, auto &sum) { - sum^=vec[index]; - }); - }); - theGridAccelerator->wait(); - Word ret = acceleratorGet(d_sum[0]); - // free(d_sum,*theGridAccelerator); - return ret; -#endif } NAMESPACE_END(Grid); -/* - -template -inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) -{ - typedef typename vobj::vector_type vector; - typedef typename vobj::scalar_type scalar; - - typedef typename vobj::scalar_typeD scalarD; - typedef typename vobj::scalar_objectD sobjD; - - sobjD ret; - scalarD *ret_p = (scalarD *)&ret; - - const int nsimd = vobj::Nsimd(); - const int words = sizeof(vobj)/sizeof(vector); - - Vector buffer(osites*nsimd); - scalar *buf = &buffer[0]; - vector *dat = (vector *)lat; - - for(int w=0;w inline void sliceSumReduction_cub_small(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { +template +inline void sliceSumReduction_cub_small(const vobj *Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) +{ size_t subvol_size = e1*e2; - commVector reduction_buffer(rd*subvol_size); + deviceVector reduction_buffer(rd*subvol_size); auto rb_p = &reduction_buffer[0]; vobj zero_init; zeroit(zero_init); @@ -94,7 +103,15 @@ template inline void sliceSumReduction_cub_small(const vobj *Data, V #if defined(GRID_SYCL) -template inline void sliceSumReduction_sycl_small(const vobj *Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template +inline void sliceSumReduction_sycl_small(const vobj *Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { size_t subvol_size = e1*e2; @@ -105,7 +122,7 @@ template inline void sliceSumReduction_sycl_small(const vobj *Data, mysum[r] = vobj_zero; } - commVector reduction_buffer(rd*subvol_size); + deviceVector reduction_buffer(rd*subvol_size); auto rb_p = &reduction_buffer[0]; @@ -144,14 +161,23 @@ template inline void sliceSumReduction_sycl_small(const vobj *Data, } #endif -template inline void sliceSumReduction_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { +template +inline void sliceSumReduction_large(const vobj *Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) +{ typedef typename vobj::vector_type vector; const int words = sizeof(vobj)/sizeof(vector); const int osites = rd*e1*e2; - commVectorbuffer(osites); + deviceVectorbuffer(osites); vector *dat = (vector *)Data; vector *buf = &buffer[0]; - Vector lvSum_small(rd); + std::vector lvSum_small(rd); vector *lvSum_ptr = (vector *)&lvSum[0]; for (int w = 0; w < words; w++) { @@ -168,13 +194,18 @@ template inline void sliceSumReduction_large(const vobj *Data, Vecto for (int r = 0; r < rd; r++) { lvSum_ptr[w+words*r]=lvSum_small[r]; } - } - - } -template inline void sliceSumReduction_gpu(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +template +inline void sliceSumReduction_gpu(const Lattice &Data, + std::vector &lvSum, + const int rd, + const int e1, + const int e2, + const int stride, + const int ostride, + const int Nsimd) { autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. if constexpr (sizeof(vobj) <= 256) { @@ -192,7 +223,15 @@ template inline void sliceSumReduction_gpu(const Lattice &Data } -template inline void sliceSumReduction_cpu(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template +inline void sliceSumReduction_cpu(const Lattice &Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { // sum over reduced dimension planes, breaking out orthog dir // Parallel over orthog direction @@ -208,16 +247,20 @@ template inline void sliceSumReduction_cpu(const Lattice &Data }); } -template inline void sliceSumReduction(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +template inline void sliceSumReduction(const Lattice &Data, + std::vector &lvSum, + const int &rd, + const int &e1, + const int &e2, + const int &stride, + const int &ostride, + const int &Nsimd) { - #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) - +#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #else +#else sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); - - #endif +#endif } diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index ad1496f5..c7dcbac9 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase inline void ScatterSlice(const cshiftVector &buf, +template inline void ScatterSlice(const deviceVector &buf, Lattice &lat, int x, int dim, @@ -140,7 +140,7 @@ template inline void ScatterSlice(const cshiftVector &buf, }); } -template inline void GatherSlice(cshiftVector &buf, +template inline void GatherSlice(deviceVector &buf, const Lattice &lat, int x, int dim, @@ -462,8 +462,8 @@ public: int rNsimd = Nsimd / simd[dimension]; assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]); - static cshiftVector send_buf; - static cshiftVector recv_buf; + static deviceVector send_buf; + static deviceVector recv_buf; send_buf.resize(buffer_size*2*depth); recv_buf.resize(buffer_size*2*depth); diff --git a/Grid/qcd/action/fermion/CayleyFermion5D.h b/Grid/qcd/action/fermion/CayleyFermion5D.h index cf39ec99..2c56c7ed 100644 --- a/Grid/qcd/action/fermion/CayleyFermion5D.h +++ b/Grid/qcd/action/fermion/CayleyFermion5D.h @@ -90,16 +90,16 @@ public: void M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, - Vector &lower, - Vector &diag, - Vector &upper); + std::vector &lower, + std::vector &diag, + std::vector &upper); void M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - Vector &lower, - Vector &diag, - Vector &upper); + std::vector &lower, + std::vector &diag, + std::vector &upper); virtual void Instantiatable(void)=0; @@ -119,35 +119,35 @@ public: RealD mass_plus, mass_minus; // Save arguments to SetCoefficientsInternal - Vector _gamma; + std::vector _gamma; RealD _zolo_hi; RealD _b; RealD _c; // Cayley form Moebius (tanh and zolotarev) - Vector omega; - Vector bs; // S dependent coeffs - Vector cs; - Vector as; + std::vector omega; + std::vector bs; // S dependent coeffs + std::vector cs; + std::vector as; // For preconditioning Cayley form - Vector bee; - Vector cee; - Vector aee; - Vector beo; - Vector ceo; - Vector aeo; + std::vector bee; + std::vector cee; + std::vector aee; + std::vector beo; + std::vector ceo; + std::vector aeo; // LDU factorisation of the eeoo matrix - Vector lee; - Vector leem; - Vector uee; - Vector ueem; - Vector dee; + std::vector lee; + std::vector leem; + std::vector uee; + std::vector ueem; + std::vector dee; // Matrices of 5d ee inverse params - Vector > MatpInv; - Vector > MatmInv; - Vector > MatpInvDag; - Vector > MatmInvDag; + // std::vector > MatpInv; + // std::vector > MatmInv; + // std::vector > MatpInvDag; + // std::vector > MatmInvDag; /////////////////////////////////////////////////////////////// // Conserved current utilities @@ -187,7 +187,7 @@ public: protected: virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); - virtual void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c); + virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c); }; NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h index 2300afd3..bfc0dd8b 100644 --- a/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h +++ b/Grid/qcd/action/fermion/ContinuedFractionFermion5D.h @@ -90,12 +90,12 @@ protected: RealD mass; RealD R; RealD ZoloHiInv; - Vector Beta; - Vector cc;; - Vector cc_d;; - Vector sqrt_cc; - Vector See; - Vector Aee; + std::vector Beta; + std::vector cc;; + std::vector cc_d;; + std::vector sqrt_cc; + std::vector See; + std::vector Aee; }; diff --git a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h index bcc97176..ff2420d5 100644 --- a/Grid/qcd/action/fermion/DomainWallEOFAFermion.h +++ b/Grid/qcd/action/fermion/DomainWallEOFAFermion.h @@ -69,10 +69,10 @@ public: // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); virtual void RefreshShiftCoefficients(RealD new_shift); @@ -83,7 +83,7 @@ public: RealD _M5, const ImplParams& p=ImplParams()); protected: - void SetCoefficientsInternal(RealD zolo_hi, Vector& gamma, RealD b, RealD c); + void SetCoefficientsInternal(RealD zolo_hi, std::vector& gamma, RealD b, RealD c); }; NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h index 60cfc727..f7655f24 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion.h @@ -102,11 +102,11 @@ public: GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, + void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag); ////////////////////////////////////////////////////////////////////////// @@ -164,8 +164,6 @@ public: DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; /////////////////////////////////////////////////////////////// // Conserved current utilities diff --git a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h index 5b26b35c..2641a6b8 100644 --- a/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h +++ b/Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h @@ -100,7 +100,6 @@ public: int dag); void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -108,7 +107,6 @@ public: int dag); void DhopInternalOverlappedComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -116,7 +114,6 @@ public: int dag); void DhopInternalSerialComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -192,8 +189,6 @@ public: DoubledGaugeField UUUmuEven; DoubledGaugeField UUUmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; // Comms buffer // std::vector > comm_buf; diff --git a/Grid/qcd/action/fermion/MobiusEOFAFermion.h b/Grid/qcd/action/fermion/MobiusEOFAFermion.h index 6e4f79eb..39c21643 100644 --- a/Grid/qcd/action/fermion/MobiusEOFAFermion.h +++ b/Grid/qcd/action/fermion/MobiusEOFAFermion.h @@ -42,11 +42,11 @@ public: public: // Shift operator coefficients for red-black preconditioned Mobius EOFA - Vector Mooee_shift; - Vector MooeeInv_shift_lc; - Vector MooeeInv_shift_norm; - Vector MooeeInvDag_shift_lc; - Vector MooeeInvDag_shift_norm; + std::vector Mooee_shift; + std::vector MooeeInv_shift_lc; + std::vector MooeeInv_shift_norm; + std::vector MooeeInvDag_shift_lc; + std::vector MooeeInvDag_shift_norm; virtual void Instantiatable(void) {}; @@ -74,18 +74,18 @@ public: // Instantiate different versions depending on Impl ///////////////////////////////////////////////////// void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper, - Vector& shift_coeffs); + std::vector& lower, std::vector& diag, std::vector& upper, + std::vector& shift_coeffs); void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper); + std::vector& lower, std::vector& diag, std::vector& upper); void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, - Vector& lower, Vector& diag, Vector& upper, - Vector& shift_coeffs); + std::vector& lower, std::vector& diag, std::vector& upper, + std::vector& shift_coeffs); virtual void RefreshShiftCoefficients(RealD new_shift); diff --git a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h index 5f69c2b1..9ec6be90 100644 --- a/Grid/qcd/action/fermion/NaiveStaggeredFermion.h +++ b/Grid/qcd/action/fermion/NaiveStaggeredFermion.h @@ -102,11 +102,11 @@ public: GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); ////////////////////////////////////////////////////////////////////////// @@ -152,9 +152,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - /////////////////////////////////////////////////////////////// // Conserved current utilities /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/PartialFractionFermion5D.h b/Grid/qcd/action/fermion/PartialFractionFermion5D.h index 54f8547f..e50a9922 100644 --- a/Grid/qcd/action/fermion/PartialFractionFermion5D.h +++ b/Grid/qcd/action/fermion/PartialFractionFermion5D.h @@ -94,8 +94,8 @@ protected: RealD R; RealD amax; RealD scale; - Vector p; - Vector q; + std::vector p; + std::vector q; }; diff --git a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h index 1545c245..00ac222f 100644 --- a/Grid/qcd/action/fermion/SchurDiagTwoKappa.h +++ b/Grid/qcd/action/fermion/SchurDiagTwoKappa.h @@ -35,7 +35,7 @@ template class KappaSimilarityTransform { public: INHERIT_IMPL_TYPES(Matrix); - Vector kappa, kappaDag, kappaInv, kappaInvDag; + std::vector kappa, kappaDag, kappaInv, kappaInvDag; KappaSimilarityTransform (Matrix &zmob) { for (int i=0;i<(int)zmob.bs.size();i++) { diff --git a/Grid/qcd/action/fermion/StaggeredKernels.h b/Grid/qcd/action/fermion/StaggeredKernels.h index d67105bb..c609be03 100644 --- a/Grid/qcd/action/fermion/StaggeredKernels.h +++ b/Grid/qcd/action/fermion/StaggeredKernels.h @@ -49,10 +49,10 @@ template class StaggeredKernels : public FermionOperator , pub public: - void DhopImproved(StencilImpl &st, LebesgueOrder &lo, + void DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior); - void DhopNaive(StencilImpl &st, LebesgueOrder &lo, + void DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior); diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 186fa278..baa1f684 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -47,7 +47,7 @@ public: static int PartialCompressionFactor(GridBase *grid) { return 1;} #endif template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -109,7 +109,7 @@ public: // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2. //////////////////////////////////////////////////////////////////////////////////////////// template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -197,7 +197,7 @@ public: #endif template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -208,7 +208,7 @@ public: else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial); } template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -402,7 +402,6 @@ public: typedef CartesianStencil Base; typedef typename Base::View_type View_type; - typedef typename Base::StencilVector StencilVector; // Vector surface_list; WilsonStencil(GridBase *grid, diff --git a/Grid/qcd/action/fermion/WilsonFermion.h b/Grid/qcd/action/fermion/WilsonFermion.h index a7a1bb69..16320a93 100644 --- a/Grid/qcd/action/fermion/WilsonFermion.h +++ b/Grid/qcd/action/fermion/WilsonFermion.h @@ -126,14 +126,17 @@ public: void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat, const FermionField &A, const FermionField &B, int dag); - void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, + void DhopInternal(StencilImpl &st, + DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); - void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - const FermionField &in, FermionField &out, int dag); + void DhopInternalSerial(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); - void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, - const FermionField &in, FermionField &out, int dag); + void DhopInternalOverlappedComms(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, FermionField &out, int dag); // Constructor WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, @@ -168,9 +171,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - WilsonAnisotropyCoefficients anisotropyCoeff; /////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/WilsonFermion5D.h b/Grid/qcd/action/fermion/WilsonFermion5D.h index 0b07d320..d614b290 100644 --- a/Grid/qcd/action/fermion/WilsonFermion5D.h +++ b/Grid/qcd/action/fermion/WilsonFermion5D.h @@ -135,21 +135,18 @@ public: int dag); void DhopInternal(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); void DhopInternalOverlappedComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag); void DhopInternalSerialComms(StencilImpl & st, - LebesgueOrder &lo, DoubledGaugeField &U, const FermionField &in, FermionField &out, @@ -203,9 +200,6 @@ public: DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - LebesgueOrder Lebesgue; - LebesgueOrder LebesgueEvenOdd; - // Comms buffer // std::vector > comm_buf; diff --git a/Grid/qcd/action/fermion/ZMobiusFermion.h b/Grid/qcd/action/fermion/ZMobiusFermion.h index fc8a7439..f8d1f11f 100644 --- a/Grid/qcd/action/fermion/ZMobiusFermion.h +++ b/Grid/qcd/action/fermion/ZMobiusFermion.h @@ -58,7 +58,7 @@ public: { // RealD eps = 1.0; std::cout< zgamma(this->Ls); + std::vector zgamma(this->Ls); for(int s=0;sLs;s++){ zgamma[s] = gamma[s]; } diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h similarity index 99% rename from Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h rename to Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h index e3bf67db..478fbb8b 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dvec.h +++ b/Grid/qcd/action/fermion/deprecated/CayleyFermion5Dvec.h @@ -1,3 +1,5 @@ +#if 0 + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -818,3 +820,5 @@ CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi, } NAMESPACE_END(Grid); + +#endif diff --git a/Grid/stencil/Lebesgue.cc b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc similarity index 99% rename from Grid/stencil/Lebesgue.cc rename to Grid/qcd/action/fermion/deprecated/Lebesgue.cc index 656ecca8..480483ed 100644 --- a/Grid/stencil/Lebesgue.cc +++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.cc @@ -1,3 +1,4 @@ +#if 0 /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void) } NAMESPACE_END(Grid); +#endif diff --git a/Grid/stencil/Lebesgue.h b/Grid/qcd/action/fermion/deprecated/Lebesgue.h similarity index 97% rename from Grid/stencil/Lebesgue.h rename to Grid/qcd/action/fermion/deprecated/Lebesgue.h index 25fa772e..0416ad80 100644 --- a/Grid/stencil/Lebesgue.h +++ b/Grid/qcd/action/fermion/deprecated/Lebesgue.h @@ -72,7 +72,7 @@ public: void ThreadInterleave(void); private: - Vector _LebesgueReorder; + deviceVector _LebesgueReorder; }; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index 2b8a3a18..8dc4fbc8 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -156,18 +156,18 @@ template void CayleyFermion5D::M5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag (Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1]=mass_minus; - Vector lower(Ls,-1.0); lower[0] =mass_plus; + std::vector diag (Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1]=mass_minus; + std::vector lower(Ls,-1.0); lower[0] =mass_plus; M5D(psi,chi,chi,lower,diag,upper); } template void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - Vector diag = bs; - Vector upper= cs; - Vector lower= cs; + std::vector diag = bs; + std::vector upper= cs; + std::vector lower= cs; upper[Ls-1]=-mass_minus*upper[Ls-1]; lower[0] =-mass_plus*lower[0]; M5D(psi,psi,Din,lower,diag,upper); @@ -176,9 +176,9 @@ void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &D template void CayleyFermion5D::Meo5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = beo; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = beo; + std::vector upper(Ls); + std::vector lower(Ls); for(int i=0;i void CayleyFermion5D::Mooee (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int i=0;i void CayleyFermion5D::MooeeDag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag = bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = bee; + std::vector upper(Ls); + std::vector lower(Ls); for (int s=0;s void CayleyFermion5D::M5Ddag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); - Vector lower(Ls,-1.0); + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); + std::vector lower(Ls,-1.0); upper[Ls-1]=-mass_plus*upper[Ls-1]; lower[0] =-mass_minus*lower[0]; M5Ddag(psi,chi,chi,lower,diag,upper); @@ -248,9 +248,9 @@ template void CayleyFermion5D::MeooeDag5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - Vector diag =bs; - Vector upper=cs; - Vector lower=cs; + std::vector diag =bs; + std::vector upper=cs; + std::vector lower=cs; for (int s=0;s::MeoDeriv(GaugeField &mat,const FermionField &U,const template void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) { - Vector gamma(this->Ls); + std::vector gamma(this->Ls); for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; SetCoefficientsInternal(1.0,gamma,b,c); } @@ -402,13 +402,13 @@ void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re template void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) { - Vector gamma(this->Ls); + std::vector gamma(this->Ls); for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; SetCoefficientsInternal(zolo_hi,gamma,b,c); } //Zolo template -void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) +void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c) { int Ls=this->Ls; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h index 0d2516c4..d3d88cbf 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5Dcache.h @@ -43,9 +43,9 @@ void CayleyFermion5D::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, - Vector &diag, - Vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); @@ -55,12 +55,16 @@ CayleyFermion5D::M5D(const FermionField &psi_i, autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - int Ls =this->Ls; + static deviceVector d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + // 10 = 3 complex mult + 2 complex add // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) uint64_t nloop = grid->oSites(); @@ -82,9 +86,9 @@ void CayleyFermion5D::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, - Vector &diag, - Vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { chi_i.Checkerboard()=psi_i.Checkerboard(); GridBase *grid=psi_i.Grid(); @@ -93,12 +97,16 @@ CayleyFermion5D::M5Ddag(const FermionField &psi_i, autoView(chi , chi_i,AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - int Ls=this->Ls; + static deviceVector d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol uint64_t nloop = grid->oSites(); accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -126,11 +134,17 @@ CayleyFermion5D::MooeeInv (const FermionField &psi_i, FermionField &chi int Ls=this->Ls; - auto plee = & lee [0]; - auto pdee = & dee [0]; - auto puee = & uee [0]; - auto pleem = & leem[0]; - auto pueem = & ueem[0]; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; uint64_t nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -182,11 +196,17 @@ CayleyFermion5D::MooeeInvDag (const FermionField &psi_i, FermionField &chi autoView(psi , psi_i,AcceleratorRead); autoView(chi , chi_i,AcceleratorWrite); - auto plee = & lee [0]; - auto pdee = & dee [0]; - auto puee = & uee [0]; - auto pleem = & leem[0]; - auto pueem = & ueem[0]; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; assert(psi.Checkerboard() == psi.Checkerboard()); diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h index 6b8336cc..8a9a0ffa 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionCache.h @@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid); // Pplus backwards.. template void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i, - Vector& lower, Vector& diag, Vector& upper) + std::vector& lower, std::vector& diag, std::vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); int Ls = this->Ls; @@ -50,9 +50,15 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi autoView( psi , psi_i, AcceleratorRead); autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + // Flops = 6.0*(Nc*Ns) *Ls*vol auto nloop=grid->oSites()/Ls; @@ -73,7 +79,7 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi_i, const FermionFi template void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i, - Vector& lower, Vector& diag, Vector& upper) + std::vector& lower, std::vector& diag, std::vector& upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase* grid = psi_i.Grid(); @@ -83,9 +89,14 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi_i, const Fermio autoView( phi , phi_i, AcceleratorRead); autoView( chi , chi_i, AcceleratorWrite); assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol @@ -114,13 +125,18 @@ void DomainWallEOFAFermion::MooeeInv(const FermionField& psi_i, FermionFie autoView( chi, chi_i, AcceleratorWrite); int Ls = this->Ls; - auto plee = & this->lee[0]; - auto pdee = & this->dee[0]; - auto puee = & this->uee[0]; - - auto pleem = & this->leem[0]; - auto pueem = & this->ueem[0]; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; + uint64_t nloop=grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ uint64_t ss=sss*Ls; diff --git a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h index 64ee4033..53b44ca2 100644 --- a/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/DomainWallEOFAFermionImplementation.h @@ -131,9 +131,9 @@ void DomainWallEOFAFermion::M5D(const FermionField& psi, FermionField& chi else{ shiftm = -shift*(mq3-mq2); } } - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; - Vector lower(Ls,-1.0); lower[0] = mq1 + shiftp; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm; + std::vector lower(Ls,-1.0); lower[0] = mq1 + shiftp; #if(0) std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl; @@ -168,9 +168,9 @@ void DomainWallEOFAFermion::M5Ddag(const FermionField& psi, FermionField& else{ shiftm = -shift*(mq3-mq2); } } - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; - Vector lower(Ls,-1.0); lower[0] = mq1 + shiftm; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp; + std::vector lower(Ls,-1.0); lower[0] = mq1 + shiftm; this->M5Ddag(psi, chi, chi, lower, diag, upper); } @@ -181,9 +181,9 @@ void DomainWallEOFAFermion::Mooee(const FermionField& psi, FermionField& c { int Ls = this->Ls; - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; @@ -200,9 +200,9 @@ void DomainWallEOFAFermion::MooeeDag(const FermionField& psi, FermionField { int Ls = this->Ls; - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; @@ -218,7 +218,7 @@ void DomainWallEOFAFermion::MooeeDag(const FermionField& psi, FermionField //Zolo template -void DomainWallEOFAFermion::SetCoefficientsInternal(RealD zolo_hi, Vector& gamma, RealD b, RealD c) +void DomainWallEOFAFermion::SetCoefficientsInternal(RealD zolo_hi, std::vector& gamma, RealD b, RealD c) { int Ls = this->Ls; int pm = this->pm; diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h index d235abbb..d2b4450e 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermion5DImplementation.h @@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D::ImprovedStaggeredFermion5D(GridCartesian UUUmu(&FourDimGrid), UUUmuEven(&FourDimRedBlackGrid), UUUmuOdd(&FourDimRedBlackGrid), - Lebesgue(&FourDimGrid), - LebesgueEvenOdd(&FourDimRedBlackGrid), _tmp(&FiveDimRedBlackGrid) { @@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D::DhopDerivOE(GaugeField &mat, /*CHANGE */ template -void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternal(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + DhopInternalOverlappedComms(st,U,UUU,in,out,dag); else - DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopInternalSerialComms(st,U,UUU,in,out,dag); } template -void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { @@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & { int interior=1; int exterior=0; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } st.CommsMerge(compressor); @@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D::DhopInternalOverlappedComms(StencilImpl & { int interior=0; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } template -void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, +void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, DoubledGaugeField & U,DoubledGaugeField & UUU, const FermionField &in, FermionField &out,int dag) { @@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D::DhopInternalSerialComms(StencilImpl & st, { int interior=1; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } /*CHANGE END*/ @@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D::DhopOE(const FermionField &in, FermionFie assert(in.Checkerboard()==Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag); + DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag); } template void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) @@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D::DhopEO(const FermionField &in, FermionFie assert(in.Checkerboard()==Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag); + DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag); } template void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) @@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D::Dhop(const FermionField &in, FermionField out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); + DhopInternal(Stencil,Umu,UUUmu,in,out,dag); } ///////////////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h index 4c80a1d5..bd9dd132 100644 --- a/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h @@ -48,8 +48,6 @@ ImprovedStaggeredFermion::ImprovedStaggeredFermion(GridCartesian &Fgrid, G StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), @@ -339,7 +337,7 @@ void ImprovedStaggeredFermion::Dhop(const FermionField &in, FermionField & out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag); + DhopInternal(Stencil, Umu, UUUmu, in, out, dag); } template @@ -351,7 +349,7 @@ void ImprovedStaggeredFermion::DhopOE(const FermionField &in, FermionField assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag); } template @@ -363,7 +361,7 @@ void ImprovedStaggeredFermion::DhopEO(const FermionField &in, FermionField assert(in.Checkerboard() == Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag); } template @@ -394,19 +392,19 @@ void ImprovedStaggeredFermion::DhopDir(const FermionField &in, FermionFiel template -void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag); + DhopInternalOverlappedComms(st,U,UUU,in,out,dag); else - DhopInternalSerialComms(st,lo,U,UUU,in,out,dag); + DhopInternalSerialComms(st,U,UUU,in,out,dag); } template -void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -429,7 +427,7 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st { int interior=1; int exterior=0; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } st.CommunicateComplete(requests); @@ -440,13 +438,13 @@ void ImprovedStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st { int interior=0; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } } template -void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, +void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, @@ -460,7 +458,7 @@ void ImprovedStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Le { int interior=1; int exterior=1; - Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); + Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior); } }; diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h index 617a18df..4827e516 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionCache.h @@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid); template void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper) + std::vector &lower, std::vector &diag, std::vector &upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -50,9 +50,13 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -74,8 +78,8 @@ void MobiusEOFAFermion::M5D(const FermionField &psi_i, const FermionField template void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper, - Vector &shift_coeffs) + std::vector &lower, std::vector &diag, std::vector &upper, + std::vector &shift_coeffs) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -86,13 +90,18 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion auto pm = this->pm; int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator - + assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - auto pshift_coeffs = &shift_coeffs[0]; + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + static deviceVector d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + auto pshift_coeffs = &d_shift_coeffs[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -119,7 +128,7 @@ void MobiusEOFAFermion::M5D_shift(const FermionField &psi_i, const Fermion template void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper) + std::vector &lower, std::vector &diag, std::vector &upper) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -130,9 +139,13 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol int nloop = grid->oSites()/Ls; @@ -154,8 +167,8 @@ void MobiusEOFAFermion::M5Ddag(const FermionField &psi_i, const FermionFie template void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i, - Vector &lower, Vector &diag, Vector &upper, - Vector &shift_coeffs) + std::vector &lower, std::vector &diag, std::vector &upper, + std::vector &shift_coeffs) { chi_i.Checkerboard() = psi_i.Checkerboard(); GridBase *grid = psi_i.Grid(); @@ -167,10 +180,15 @@ void MobiusEOFAFermion::M5Ddag_shift(const FermionField &psi_i, const Ferm assert(phi.Checkerboard() == psi.Checkerboard()); - auto pdiag = &diag[0]; - auto pupper = &upper[0]; - auto plower = &lower[0]; - auto pshift_coeffs = &shift_coeffs[0]; + static deviceVector d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t)); + static deviceVector d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t)); + static deviceVector d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t)); + static deviceVector d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t)); + + auto pdiag = &d_diag[0]; + auto pupper = &d_upper[0]; + auto plower = &d_lower[0]; + auto pshift_coeffs = &d_shift_coeffs[0]; // Flops = 6.0*(Nc*Ns) *Ls*vol auto pm = this->pm; @@ -212,11 +230,17 @@ void MobiusEOFAFermion::MooeeInv(const FermionField &psi_i, FermionField & autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; } @@ -268,14 +292,24 @@ void MobiusEOFAFermion::MooeeInv_shift(const FermionField &psi_i, FermionF autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); + // Move into object and constructor + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + auto pm = this->pm; - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; - auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0]; - auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0]; + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; + + static deviceVector d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t)); + static deviceVector d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t)); + auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0]; + auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0]; int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -333,11 +367,17 @@ void MobiusEOFAFermion::MooeeInvDag(const FermionField &psi_i, FermionFiel autoView(psi , psi_i, AcceleratorRead); autoView(chi , chi_i, AcceleratorWrite); - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ @@ -386,14 +426,28 @@ void MobiusEOFAFermion::MooeeInvDag_shift(const FermionField &psi_i, Fermi autoView(chi , chi_i, AcceleratorWrite); int Ls = this->Ls; + static deviceVector d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t)); + static deviceVector d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t)); + static deviceVector d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t)); + auto pm = this->pm; - auto plee = & this->lee [0]; - auto pdee = & this->dee [0]; - auto puee = & this->uee [0]; - auto pleem= & this->leem[0]; - auto pueem= & this->ueem[0]; - auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; - auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; + auto plee = & d_lee [0]; + auto pdee = & d_dee [0]; + auto puee = & d_uee [0]; + auto pleem = & d_leem[0]; + auto pueem = & d_ueem[0]; + + static deviceVector d_MooeeInvDag_shift_lc(Ls); + static deviceVector d_MooeeInvDag_shift_norm(Ls); + acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t)); + acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t)); + auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0]; + auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0]; + + // auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0]; + // auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0]; int nloop = grid->oSites()/Ls; accelerator_for(sss,nloop,Simd::Nsimd(),{ diff --git a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h index 9b9db178..70f06dfc 100644 --- a/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/MobiusEOFAFermionImplementation.h @@ -196,9 +196,9 @@ void MobiusEOFAFermion::M5D(const FermionField& psi, FermionField& chi) { int Ls = this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; - Vector lower(Ls,-1.0); lower[0] = this->mq1; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; + std::vector lower(Ls,-1.0); lower[0] = this->mq1; // no shift term if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); } @@ -212,9 +212,9 @@ void MobiusEOFAFermion::M5Ddag(const FermionField& psi, FermionField& chi) { int Ls = this->Ls; - Vector diag(Ls,1.0); - Vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; - Vector lower(Ls,-1.0); lower[0] = this->mq1; + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1] = this->mq1; + std::vector lower(Ls,-1.0); lower[0] = this->mq1; // no shift term if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); } @@ -230,9 +230,9 @@ void MobiusEOFAFermion::Mooee(const FermionField& psi, FermionField& chi) int Ls = this->Ls; // coefficients of Mooee - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s]; lower[s] = -this->cee[s]; @@ -253,9 +253,9 @@ void MobiusEOFAFermion::MooeeDag(const FermionField& psi, FermionField& ch int Ls = this->Ls; // coefficients of MooeeDag - Vector diag = this->bee; - Vector upper(Ls); - Vector lower(Ls); + std::vector diag = this->bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int s=0; scee[s+1]; @@ -314,10 +314,10 @@ void MobiusEOFAFermion::SetCoefficientsPrecondShiftOps() // Tridiagonal solve for MooeeInvDag_shift_lc { Coeff_t m(0.0); - Vector d = Mooee_shift; - Vector u(Ls,0.0); - Vector y(Ls,0.0); - Vector q(Ls,0.0); + std::vector d = Mooee_shift; + std::vector u(Ls,0.0); + std::vector y(Ls,0.0); + std::vector q(Ls,0.0); if(pm == 1){ u[0] = 1.0; } else{ u[Ls-1] = 1.0; } diff --git a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h index bf23d99d..b596dc44 100644 --- a/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/NaiveStaggeredFermionImplementation.h @@ -48,8 +48,6 @@ NaiveStaggeredFermion::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), @@ -268,7 +266,7 @@ void NaiveStaggeredFermion::Dhop(const FermionField &in, FermionField &out out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); + DhopInternal(Stencil, Umu, in, out, dag); } template @@ -280,7 +278,7 @@ void NaiveStaggeredFermion::DhopOE(const FermionField &in, FermionField &o assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, in, out, dag); } template @@ -292,7 +290,7 @@ void NaiveStaggeredFermion::DhopEO(const FermionField &in, FermionField &o assert(in.Checkerboard() == Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, in, out, dag); } template @@ -323,18 +321,18 @@ void NaiveStaggeredFermion::DhopDir(const FermionField &in, FermionField & template -void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else - DhopInternalSerialComms(st,lo,U,in,out,dag); + DhopInternalSerialComms(st,U,in,out,dag); } template -void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -356,7 +354,7 @@ void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, L { int interior=1; int exterior=0; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } st.CommunicateComplete(requests); @@ -367,12 +365,12 @@ void NaiveStaggeredFermion::DhopInternalOverlappedComms(StencilImpl &st, L { int interior=0; int exterior=1; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } } template -void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, +void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -385,7 +383,7 @@ void NaiveStaggeredFermion::DhopInternalSerialComms(StencilImpl &st, Lebes { int interior=1; int exterior=1; - Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior); + Kernels::DhopNaive(st,U,in,out,dag,interior,exterior); } }; diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h index 2b6087bc..04337671 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h @@ -375,23 +375,6 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, } } -/* -#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \ - template void StaggeredKernels::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ - \ - template void StaggeredKernels::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ - \ - template void StaggeredKernels::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \ - DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \ - SiteSpinor *buf, int LLs, int sU, \ - const FermionFieldView &in, FermionFieldView &out, int dag); \ -*/ #undef LOAD_CHI #undef HAND_DECLARATIONS diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index a39b529f..05dbf3b2 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -256,7 +256,7 @@ void StaggeredKernels::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie }); template -void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior,int exterior) { @@ -294,7 +294,7 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, assert(0 && " Kernel optimisation case not covered "); } template -void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, +void StaggeredKernels::DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior,int exterior) { diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 95af4c38..2ad48926 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -58,15 +58,9 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, Umu(_FourDimGrid), UmuEven(_FourDimRedBlackGrid), UmuOdd (_FourDimRedBlackGrid), - Lebesgue(_FourDimGrid), - LebesgueEvenOdd(_FourDimRedBlackGrid), _tmp(&FiveDimRedBlackGrid), Dirichlet(0) { - Stencil.lo = &Lebesgue; - StencilEven.lo = &LebesgueEvenOdd; - StencilOdd.lo = &LebesgueEvenOdd; - // some assertions assert(FiveDimGrid._ndimension==5); assert(FourDimGrid._ndimension==4); @@ -305,19 +299,19 @@ void WilsonFermion5D::DhopDerivOE(GaugeField &mat, } template -void WilsonFermion5D::DhopInternal(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternal(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else - DhopInternalSerialComms(st,lo,U,in,out,dag); + DhopInternalSerialComms(st,U,in,out,dag); } template -void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) { @@ -331,10 +325,12 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Start comms // Gather intranode and extra node differentiated?? ///////////////////////////// { + std::cout << " WilsonFermion5D gather " < > requests; auto id=traceStart("Communicate overlapped"); st.CommunicateBegin(requests); @@ -343,6 +339,7 @@ void WilsonFermion5D::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // Overlap with comms ///////////////////////////// { + std::cout << " WilsonFermion5D Comms merge " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg ///////////////////////////// // do the compute interior ///////////////////////////// + std::cout << " WilsonFermion5D Interior " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg ///////////////////////////// // Complete comms ///////////////////////////// + std::cout << " WilsonFermion5D Comms Complete " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg // do the compute exterior ///////////////////////////// { + std::cout << " WilsonFermion5D Comms Merge " <::DhopInternalOverlappedComms(StencilImpl & st, Lebesg GRID_TRACE("DhopExterior"); Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); } + std::cout << " WilsonFermion5D Done " < -void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo, +void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, DoubledGaugeField & U, const FermionField &in, FermionField &out,int dag) @@ -395,11 +397,13 @@ void WilsonFermion5D::DhopInternalSerialComms(StencilImpl & st, LebesgueOr int LLs = in.Grid()->_rdimensions[0]; + std::cout << " WilsonFermion5D Halo exch " <::DhopInternalSerialComms(StencilImpl & st, LebesgueOr GRID_TRACE("Dhop"); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out); } + std::cout << " WilsonFermion5D Done " <::DhopOE(const FermionField &in, FermionField &out,int assert(in.Checkerboard()==Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag); + DhopInternal(StencilEven,UmuOdd,in,out,dag); } template void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int dag) @@ -431,7 +436,7 @@ void WilsonFermion5D::DhopEO(const FermionField &in, FermionField &out,int assert(in.Checkerboard()==Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag); + DhopInternal(StencilOdd,UmuEven,in,out,dag); } template void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int dag) @@ -441,7 +446,7 @@ void WilsonFermion5D::Dhop(const FermionField &in, FermionField &out,int d out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil,Lebesgue,Umu,in,out,dag); + DhopInternal(Stencil,Umu,in,out,dag); } template void WilsonFermion5D::DW(const FermionField &in, FermionField &out,int dag) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 1a262533..8c58f692 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -52,17 +52,12 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd mass(_mass), - Lebesgue(_grid), - LebesgueEvenOdd(_cbgrid), Umu(&Fgrid), UmuEven(&Hgrid), UmuOdd(&Hgrid), _tmp(&Hgrid), anisotropyCoeff(anis) { - Stencil.lo = &Lebesgue; - StencilEven.lo = &LebesgueEvenOdd; - StencilOdd.lo = &LebesgueEvenOdd; // Allocate the required comms buffer ImportGauge(_Umu); if (anisotropyCoeff.isAnisotropic){ @@ -314,7 +309,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da out.Checkerboard() = in.Checkerboard(); - DhopInternal(Stencil, Lebesgue, Umu, in, out, dag); + DhopInternal(Stencil, Umu, in, out, dag); } template @@ -326,7 +321,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int assert(in.Checkerboard() == Even); out.Checkerboard() = Odd; - DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag); + DhopInternal(StencilEven, UmuOdd, in, out, dag); } template @@ -338,7 +333,7 @@ void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int d assert(in.Checkerboard() == Odd); out.Checkerboard() = Even; - DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag); + DhopInternal(StencilOdd, UmuEven, in, out, dag); } template @@ -391,21 +386,21 @@ void WilsonFermion::DhopDirCalc(const FermionField &in, FermionField &out, }; template -void WilsonFermion::DhopInternal(StencilImpl &st, LebesgueOrder &lo, +void WilsonFermion::DhopInternal(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) { #ifdef GRID_OMP if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) - DhopInternalOverlappedComms(st,lo,U,in,out,dag); + DhopInternalOverlappedComms(st,U,in,out,dag); else #endif - DhopInternalSerial(st,lo,U,in,out,dag); + DhopInternalSerial(st,U,in,out,dag); } template -void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, +void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag) @@ -474,10 +469,10 @@ void WilsonFermion::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO template -void WilsonFermion::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, - DoubledGaugeField &U, - const FermionField &in, - FermionField &out, int dag) +void WilsonFermion::DhopInternalSerial(StencilImpl &st, + DoubledGaugeField &U, + const FermionField &in, + FermionField &out, int dag) { GRID_TRACE("DhopSerial"); assert((dag == DaggerNo) || (dag == DaggerYes)); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h index e025ba41..2633c127 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h @@ -40,11 +40,11 @@ Author: paboyle /// Switch off the 5d vectorised code optimisations #undef DWFVEC5D -static Vector signsF; +static std::vector signsF; template - int setupSigns(Vector& signs ){ - Vector bother(2); + int setupSigns(std::vector& signs ){ + std::vector bother(2); signs = bother; vrsign(signs[0]); visign(signs[1]); @@ -364,7 +364,7 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, Doubled #include -static Vector signsD; +static std::vector signsD; static int signInitD = setupSigns(signsD); #define MAYBEPERM(A,perm) if (perm) { A ; } diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 90defc54..43662b9c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -434,7 +434,7 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #define ASM_CALL(A) \ thread_for( sss, Nsite, { \ - int ss = st.lo->Reorder(sss); \ + int ss = sss; /*st.lo->Reorder(sss);*/ \ int sU = ss; \ int sF = ss*Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ diff --git a/Grid/qcd/representations/adjoint.h b/Grid/qcd/representations/adjoint.h index ee54b465..8d7e9e3c 100644 --- a/Grid/qcd/representations/adjoint.h +++ b/Grid/qcd/representations/adjoint.h @@ -40,7 +40,7 @@ public: U = Zero(); LatticeColourMatrix tmp(Uin.Grid()); - Vector::Matrix> ta(Dimension); + std::vector::Matrix> ta(Dimension); // Debug lines // LatticeMatrix uno(Uin.Grid()); diff --git a/Grid/qcd/representations/two_index.h b/Grid/qcd/representations/two_index.h index 24d6d7cb..c9c1db94 100644 --- a/Grid/qcd/representations/two_index.h +++ b/Grid/qcd/representations/two_index.h @@ -43,7 +43,7 @@ public: U = Zero(); LatticeColourMatrix tmp(Uin.Grid()); - Vector::Matrix> eij(Dimension); + std::vector::Matrix> eij(Dimension); for (int a = 0; a < Dimension; a++) GaugeGroupTwoIndex::base(a, eij[a]); diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index b63d8571..a81ebe6c 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -158,12 +158,12 @@ void A2Autils::MesonField(TensorType &mat, int MFrvol = rd*Lblock*Rblock*Nmom; int MFlvol = ld*Lblock*Rblock*Nmom; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for( r, MFrvol,{ lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol,{ lsSum[r]=scalar_type(0.0); }); @@ -346,12 +346,12 @@ void A2Autils::PionFieldXX(Eigen::Tensor &mat, int MFrvol = rd*Lblock*Rblock; int MFlvol = ld*Lblock*Rblock; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for(r,MFrvol,{ lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol,{ lsSum[r]=scalar_type(0.0); }); @@ -493,12 +493,12 @@ void A2Autils::PionFieldWVmom(Eigen::Tensor &mat, int MFrvol = rd*Lblock*Rblock*Nmom; int MFlvol = ld*Lblock*Rblock*Nmom; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for(r,MFrvol,{ lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol,{ lsSum[r]=scalar_type(0.0); }); @@ -700,13 +700,13 @@ void A2Autils::AslashField(TensorType &mat, int MFrvol = rd*Lblock*Rblock*Nem; int MFlvol = ld*Lblock*Rblock*Nem; - Vector lvSum(MFrvol); + std::vector lvSum(MFrvol); thread_for(r,MFrvol, { lvSum[r] = Zero(); }); - Vector lsSum(MFlvol); + std::vector lsSum(MFlvol); thread_for(r,MFlvol, { lsSum[r] = scalar_type(0.0); diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 9d9cb508..9a1d312b 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -971,7 +971,9 @@ void BaryonUtils::BaryonGamma3pt( autoView( vq_ti , q_ti , AcceleratorRead); autoView( vq_tf , q_tf , AcceleratorRead); - Vector my_Dq_spec{Dq_spec1,Dq_spec2}; + deviceVector my_Dq_spec(2); + acceleratorPut(my_Dq_spec[0],Dq_spec1); + acceleratorPut(my_Dq_spec[1],Dq_spec2); mobj * Dq_spec_p = &my_Dq_spec[0]; if (group == 1) { @@ -1300,7 +1302,8 @@ void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead); - Vector my_Dq_spec{Du_spec}; + deviceVector my_Dq_spec(1); + acceleratorPut(my_Dq_spec[0],Du_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; if(op == "Q1"){ @@ -1353,7 +1356,8 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, autoView( vd_tf , qd_tf , AcceleratorRead ); autoView( vs_ti , qs_ti , AcceleratorRead ); - Vector my_Dq_spec{Du_spec}; + deviceVector my_Dq_spec(1); + acceleratorPut(my_Dq_spec[0],Du_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; if(op == "Q1"){ @@ -1544,7 +1548,9 @@ void BaryonUtils::XiToSigmaEye(const PropagatorField &qq_loop, autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead); - Vector my_Dq_spec{Dd_spec,Ds_spec}; + deviceVector my_Dq_spec(2); + acceleratorPut(my_Dq_spec[0],Dd_spec); + acceleratorPut(my_Dq_spec[0],Ds_spec); mobj * Dq_spec_p = &my_Dq_spec[0]; if(op == "Q1"){ diff --git a/Grid/qcd/utils/SUnAdjoint.h b/Grid/qcd/utils/SUnAdjoint.h index 84c7278c..cfc48bbf 100644 --- a/Grid/qcd/utils/SUnAdjoint.h +++ b/Grid/qcd/utils/SUnAdjoint.h @@ -62,7 +62,7 @@ public: // returns i(T_Adj)^index necessary for the projectors // see definitions above iAdjTa = Zero(); - Vector > ta(ncolour * ncolour - 1); + iSUnMatrix ta[ncolour * ncolour - 1]; iSUnMatrix tmp; // FIXME not very efficient to get all the generators everytime diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h index b6848977..66d25bc4 100644 --- a/Grid/stencil/GeneralLocalStencil.h +++ b/Grid/stencil/GeneralLocalStencil.h @@ -72,7 +72,7 @@ public: } // Resident in managed memory - Vector _entries; + deviceVector _entries; GeneralLocalStencil(GridBase *grid, const std::vector &shifts) { @@ -141,7 +141,7 @@ public: //////////////////////////////////////////////// // Store in look up table //////////////////////////////////////////////// - this->_entries[lex] = SE; + acceleratorPut(this->_entries[lex],SE); } }); } diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index dabd70a6..eca9cd3c 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -19,7 +19,7 @@ public: static int PartialCompressionFactor(GridBase *grid) {return 1;}; // Decompress is after merge so ok template - static void Gather_plane_simple (commVector >& table, + static void Gather_plane_simple (deviceVector >& table, const Lattice &rhs, cobj *buffer, compressor &compress, @@ -35,7 +35,7 @@ public: rhs_v.ViewClose(); } template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, + static void Gather_plane_exchange(deviceVector >& table,const Lattice &rhs, std::vector pointers,int dimension,int plane,int cbmask, compressor &compress,int type,int partial) { @@ -83,25 +83,6 @@ public: // Wilson compressor will add alternate policies for Dirichlet // and possibly partial Dirichlet for DWF //////////////////////////////////// -/* -class FaceGatherDirichlet -{ - // If it's dirichlet we don't assemble comms buffers - // - // Rely on zeroes in gauge field to drive the correct result - // NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute - template - static void Gather_plane_simple (commVector >& table,const Lattice &rhs,cobj *buffer,compressor &compress, int off,int so){}; - template - static void Gather_plane_exchange(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, - compressor &compress,int type) {} - template - static void Merge(decompressor decompress,Merge &mm) { } - template - static void Decompress(decompressor decompress,Decompression &dd) {} -}; -*/ template class SimpleCompressorGather : public FaceGather { diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 80acb4ae..0918df8e 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -31,7 +31,6 @@ #define STENCIL_MAX (16) #include // subdir aggregate -#include // subdir aggregate #include ////////////////////////////////////////////////////////////////////////////////////////// @@ -256,7 +255,6 @@ protected: GridBase * _grid; public: GridBase *Grid(void) const { return _grid; } - LebesgueOrder *lo; //////////////////////////////////////////////////////////////////////// // Needed to conveniently communicate gparity parameters into GPU memory @@ -273,11 +271,11 @@ public: int face_table_computed; int partialDirichlet; int fullDirichlet; - std::vector > > face_table ; - Vector surface_list; + std::vector > > face_table ; + deviceVector surface_list; - stencilVector _entries; // Resident in managed memory - commVector _entries_device; // Resident in device memory + std::vector _entries; // Resident in host memory + deviceVector _entries_device; // Resident in device memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; @@ -370,7 +368,6 @@ public: // accelerator_barrier(); // All kernels should ALREADY be complete // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. -#ifdef ACCELERATOR_AWARE_MPI for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, Packets[i].send_buf, @@ -379,23 +376,6 @@ public: Packets[i].from_rank,Packets[i].do_recv, Packets[i].xbytes,Packets[i].rbytes,i); } -#else -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - for(int i=0;iHostBufferMalloc(Packets[i].xbytes); - Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); - if ( Packets[i].do_send ) { - acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); - } - _grid->StencilSendToRecvFromBegin(MpiReqs, - Packets[i].host_send_buf, - Packets[i].to_rank,Packets[i].do_send, - Packets[i].host_recv_buf, - Packets[i].from_rank,Packets[i].do_recv, - Packets[i].xbytes,Packets[i].rbytes,i); - } -#endif // Get comms started then run checksums // Having this PRIOR to the dslash seems to make Sunspot work... (!) for(int i=0;iStencilBarrier(); -#ifndef ACCELERATOR_AWARE_MPI -#warning "Using COPY VIA HOST BUFFERS IN STENCIL" - for(int i=0;iHostBufferFreeAll(); -#endif // run any checksums for(int i=0;i_npoints;point++){ this->same_node[point] = this->SameNode(point); } - + int32_t surface_list_size=0; for(int site = 0 ;site< vol4;site++){ int local = 1; for(int point=0;point_npoints;point++){ @@ -678,11 +649,28 @@ public: } if(local == 0) { for(int s=0;s_npoints;point++){ + if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){ + local = 0; + } + } + if(local == 0) { + for(int s=0;s 0); - } -#endif - if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ - arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); - GridCmdOptionIntVector(arg,LebesgueOrder::Block); - } if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){ GridLogTimestamp(0); } else { diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 2b1f6261..c42136b6 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -644,11 +644,6 @@ int main (int argc, char ** argv) Grid_init(&argc,&argv); CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); -#ifdef KNL - LebesgueOrder::Block = std::vector({8,2,2,2}); -#else - LebesgueOrder::Block = std::vector({2,2,2,2}); -#endif Benchmark::Decomposition(); int do_su4=1; diff --git a/benchmarks/Benchmark_memory_asynch.cc b/benchmarks/Benchmark_memory_asynch.cc index 97825144..4c27fc2c 100644 --- a/benchmarks/Benchmark_memory_asynch.cc +++ b/benchmarks/Benchmark_memory_asynch.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) pRNG.SeedFixedIntegers(std::vector({56,17,89,101})); std::vector stop(threads); - Vector sum(threads); + std::vector sum(threads); std::vector x(threads,&Grid); for(int t=0;t diag = Dw.bs; - Vector upper= Dw.cs; - Vector lower= Dw.cs; + std::vector diag = Dw.bs; + std::vector upper= Dw.cs; + std::vector lower= Dw.cs; upper[Ls-1]=-Dw.mass_minus*upper[Ls-1]; lower[0] =-Dw.mass_plus*lower[0]; diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc index 870cb6ec..d2bbf769 100644 --- a/benchmarks/Benchmark_usqcd.cc +++ b/benchmarks/Benchmark_usqcd.cc @@ -861,7 +861,7 @@ int main (int argc, char ** argv) } CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); - LebesgueOrder::Block = std::vector({2,2,2,2}); + // LebesgueOrder::Block = std::vector({2,2,2,2}); Benchmark::Decomposition(); diff --git a/configure.ac b/configure.ac index 8e8d67af..652944f9 100644 --- a/configure.ac +++ b/configure.ac @@ -225,18 +225,6 @@ case ${ac_SFW_FP16} in AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; esac -############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons -AC_ARG_ENABLE([accelerator-aware-mpi], - [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], - [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) - -case ${ac_ACCELERATOR_AWARE_MPI} in - yes) - AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) - AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; - *);; -esac - ############### SYCL/CUDA/HIP/none AC_ARG_ENABLE([accelerator], @@ -664,16 +652,6 @@ case ${ac_SHM_FAST_PATH} in *) ;; esac -############### communication type selection -AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes]) - -case ${ac_COMMS_THREADS} in - yes) - AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] ) - ;; - *) ;; -esac - ############### communication type selection AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) diff --git a/systems/Aurora/benchmarks/bench1.pbs b/systems/Aurora/benchmarks/bench1.pbs index b53327f0..a202b587 100644 --- a/systems/Aurora/benchmarks/bench1.pbs +++ b/systems/Aurora/benchmarks/bench1.pbs @@ -1,6 +1,6 @@ #!/bin/bash -#PBS -q debug +#PBS -q EarlyAppAccess #PBS -l select=1 #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA @@ -44,7 +44,7 @@ CMD="mpiexec -np 1 -ppn 1 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " -#$CMD | tee 1tile.dwf +$CMD | tee 1tile.dwf CMD="mpiexec -np 12 -ppn 12 -envall \ ./gpu_tile_compact.sh \ diff --git a/systems/Aurora/benchmarks/bench2.pbs b/systems/Aurora/benchmarks/bench2.pbs index ea469cda..ce477319 100644 --- a/systems/Aurora/benchmarks/bench2.pbs +++ b/systems/Aurora/benchmarks/bench2.pbs @@ -1,6 +1,6 @@ #!/bin/bash -#PBS -q workq +#PBS -q EarlyAppAccess #PBS -l select=2 #PBS -l walltime=00:20:00 #PBS -A LatticeQCD_aesp_CNDA @@ -43,13 +43,13 @@ $CMD | tee 2node.comms CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 " $CMD | tee 2node.32.32.64.48.dwf CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ - --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 " $CMD | tee 2node.64.64.64.96.dwf diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 7abe667f..8ccba356 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -1,40 +1,12 @@ +module load oneapi/release/2023.12.15.001 +#module load intel_compute_runtime/release/821.35 source ~/spack/share/spack/setup-env.sh spack load c-lime +spack load openssl export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' ` -#spack load libefence -#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' ` -#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH -#spack load gperftools -export TCMALLOC=/home/paboyle/gperftools/install -export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH -export INTELGT_AUTO_ATTACH_DISABLE=1 - -#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 -#module load oneapi/release/2023.12.15.001 -#module use /soft/modulefiles -#module load intel_compute_runtime/release/agama-devel-682.22 - -#export FI_CXI_DEFAULT_CQ_SIZE=131072 -#export FI_CXI_CQ_FILL_PERCENT=20 -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" -#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" - -# -# -ftarget-register-alloc-mode=pvc:default -# -ftarget-register-alloc-mode=pvc:small -# -ftarget-register-alloc-mode=pvc:large -# -ftarget-register-alloc-mode=pvc:auto -#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 - export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 git config --global http.proxy http://proxy.alcf.anl.gov:3128 - -#source ~/spack/share/spack/setup-env.sh -#spack load gperftools -#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' ` -#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH - export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" diff --git a/systems/Aurora/tests/reproBigJob.pbs b/systems/Aurora/tests/reproBigJob.pbs index 205fefce..721b4707 100644 --- a/systems/Aurora/tests/reproBigJob.pbs +++ b/systems/Aurora/tests/reproBigJob.pbs @@ -1,6 +1,6 @@ #!/bin/bash -#PBS -l select=16 +#PBS -l select=32 #PBS -q EarlyAppAccess #PBS -A LatticeQCD_aesp_CNDA #PBS -l walltime=02:00:00 @@ -15,7 +15,7 @@ # 56 cores / 6 threads ~9 export OMP_NUM_THREADS=6 -export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 @@ -24,14 +24,14 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 -export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1 export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export GRID_PRINT_ENTIRE_LOG=0 -export GRID_CHECKSUM_RECV_BUF=0 -export GRID_CHECKSUM_SEND_BUF=0 +export GRID_CHECKSUM_RECV_BUF=1 +export GRID_CHECKSUM_SEND_BUF=1 export MPICH_OFI_NIC_POLICY=GPU @@ -51,10 +51,10 @@ cd $DIR cp $PBS_NODEFILE nodefile -CMD="mpiexec -np 192 -ppn 12 -envall --hostfile nodefile \ +CMD="mpiexec -np 384 -ppn 12 -envall --hostfile nodefile \ ../gpu_tile_compact.sh \ - ../Test_dwf_mixedcg_prec --mpi 4.4.4.3 --grid 128.128.128.96 \ - --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap" + ../Test_dwf_mixedcg_prec --mpi 4.4.4.6 --grid 128.128.128.96 \ + --shm-mpi 1 --comms-overlap --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --debug-signals" echo $CMD > command-line env > environment diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 212b1a35..e60d3555 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -88,6 +88,7 @@ int main (int argc, char ** argv) Ctilde=C; std::cout<<" Benchmarking FFT of LatticeComplex "< testAlgebra; @@ -148,11 +149,12 @@ void checkSigma(const GparityFlavour::Algebra a, GridSerialRNG &rng) test(m*g, m*testg); std::cout << std::endl; } +#endif int main(int argc, char *argv[]) { Grid_init(&argc,&argv); - +#ifdef ENABLE_GPARITY Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(4,vComplex::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); @@ -170,7 +172,7 @@ int main(int argc, char *argv[]) checkSigma(i, sRNG); } std::cout << GridLogMessage << std::endl; - +#endif Grid_finalize(); return EXIT_SUCCESS; diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc index c8587435..0f3c8aad 100644 --- a/tests/core/Test_gpwilson_even_odd.cc +++ b/tests/core/Test_gpwilson_even_odd.cc @@ -35,7 +35,7 @@ using namespace Grid; int main (int argc, char ** argv) { Grid_init(&argc,&argv); - +#ifdef ENABLE_GPARITY Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); @@ -216,6 +216,6 @@ int main (int argc, char ** argv) std::cout<oSites(),1,{ - assert(B[v]==A_v[ss]()()().getlane(0)); + // assert(B[v]==A_v[ss]()()().getlane(0)); }); // std::cout << "["< inline void sliceSumCPU(const Grid::Lattice &Data,std int ld=grid->_ldimensions[orthogdim]; int rd=grid->_rdimensions[orthogdim]; - Vector lvSum(rd); // will locally sum vectors first - Vector lsSum(ld,Zero()); // sum across these down to scalars + std::vector lvSum(rd); // will locally sum vectors first + std::vector lsSum(ld,Zero()); // sum across these down to scalars ExtractBuffer extracted(Nsimd); // splitting the SIMD result.resize(fd); // And then global sum to return the same vector to every node diff --git a/tests/sp2n/Test_2as_base.cc b/tests/sp2n/Test_2as_base.cc index 62e86609..3aeccae0 100644 --- a/tests/sp2n/Test_2as_base.cc +++ b/tests/sp2n/Test_2as_base.cc @@ -87,8 +87,8 @@ static void run_generators_checks() { typedef typename Sp_TwoIndex::template iGroupMatrix Matrix; int sum = 0; int sum_im = 0; - Vector ta_fund(this_algebra_dim); - Vector eij(this_irrep_dim); + std::vector ta_fund(this_algebra_dim); + std::vector eij(this_irrep_dim); Matrix tmp_l; Matrix tmp_r; for (int n = 0; n < this_algebra_dim; n++)