From 0174f5f742782d1b43e49213bd9d729f7094962e Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 11 Jun 2020 16:50:43 +0100 Subject: [PATCH 1/8] look for librt when using shm=shmopen --- configure.ac | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/configure.ac b/configure.ac index 6cacb813..85f9f50e 100644 --- a/configure.ac +++ b/configure.ac @@ -428,6 +428,14 @@ case ${ac_SHM} in shmopen) AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] ) + CXXFLAGS_CPY=$CXXFLAGS + LDFLAGS_CPY=$LDFLAGS + CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS" + LDFLAGS="$AM_LDFLAGS $LDFLAGS" + AC_SEARCH_LIBS([shm_unlink], [rt], [], + [AC_MSG_ERROR("no library found for shm_unlink")]) + CXXFLAGS=$CXXFLAGS_CPY + LDFLAGS=$LDFLAGS_CPY ;; shmget) From edf17708a813d4ee2c4765dc24bd7e4943c3e784 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 18 Jun 2020 22:41:06 -0400 Subject: [PATCH 2/8] Range improvement --- tests/hmc/Test_multishift_sqrt.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/hmc/Test_multishift_sqrt.cc b/tests/hmc/Test_multishift_sqrt.cc index 5a6d8ba9..31697c12 100644 --- a/tests/hmc/Test_multishift_sqrt.cc +++ b/tests/hmc/Test_multishift_sqrt.cc @@ -104,7 +104,7 @@ int main (int argc, char ** argv) GridDefaultMpi()); double lo=0.001; - double hi=1.0; + double hi=20.0; int precision=64; int degree=10; AlgRemez remez(lo,hi,precision); From 1aa988b2af51e07862cbabd4aa3302a7ffef7f7e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 19 Jun 2020 01:21:14 -0400 Subject: [PATCH 3/8] Comms overlap fix UVM case --- .../fermion/implementation/WilsonFermionImplementation.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index fd81d322..f647bef8 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -67,7 +67,12 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, diag_mass = 4.0 + mass; } - + int vol4; + vol4=Fgrid.oSites(); + Stencil.BuildSurfaceList(1,vol4); + vol4=Hgrid.oSites(); + StencilEven.BuildSurfaceList(1,vol4); + StencilOdd.BuildSurfaceList(1,vol4); } template From 66005929af0eba50e811f2e0a96a3262dd665753 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 19 Jun 2020 12:50:54 -0400 Subject: [PATCH 4/8] Set up the cache size on all ranks --- Grid/threads/Accelerator.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index d049fd2f..ca46f119 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -37,9 +37,10 @@ void acceleratorInit(void) #define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); #define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d"); cudaGetDeviceProperties(&gpu_props[i], i); + cudaDeviceProp prop; + prop = gpu_props[i]; + totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { - cudaDeviceProp prop; - prop = gpu_props[i]; printf("AcceleratorCudaInit: ========================\n"); printf("AcceleratorCudaInit: Device Number : %d\n", i); printf("AcceleratorCudaInit: ========================\n"); @@ -49,7 +50,6 @@ void acceleratorInit(void) GPU_PROP(managedMemory); GPU_PROP(isMultiGpuBoard); GPU_PROP(warpSize); - totalDeviceMem = prop.totalGlobalMem; // GPU_PROP(unifiedAddressing); // GPU_PROP(l2CacheSize); // GPU_PROP(singleToDoublePrecisionPerfRatio); From 11bc1aeadcf8f43c4e52af52e0fa8c1e7188d835 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 19 Jun 2020 14:30:35 -0400 Subject: [PATCH 5/8] TThread count defaultt to fastest --- Grid/threads/Accelerator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index ca46f119..2c4ad9df 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -1,7 +1,7 @@ #include NAMESPACE_BEGIN(Grid); -uint32_t accelerator_threads=8; +uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; From b949cf6b12e7a88894344f7284c242aa3eb9eb4b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 19 Jun 2020 17:13:27 -0400 Subject: [PATCH 6/8] PeekLocal needs a view to keep thread safe. ALLOCATION_CACHEE reenable --- Grid/algorithms/FFT.h | 9 ++-- Grid/allocator/MemoryManager.cc | 24 ++++++--- Grid/allocator/MemoryManager.h | 2 +- Grid/lattice/Lattice_peekpoke.h | 24 ++++----- Grid/lattice/Lattice_transfer.h | 32 ++++++----- Grid/lattice/Lattice_view.h | 2 + Grid/qcd/action/fermion/DomainWallVec5dImpl.h | 15 +++--- Grid/qcd/action/fermion/StaggeredVec5dImpl.h | 15 +----- .../WilsonCloverFermionImplementation.h | 53 ++++++++++--------- .../WilsonFermion5DImplementation.h | 34 +++++++----- Grid/util/Init.cc | 11 +++- 11 files changed, 125 insertions(+), 96 deletions(-) diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 550186fc..765305d7 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -230,14 +230,15 @@ public: result = source; int pc = processor_coor[dim]; for(int p=0;plSites(),{ Coordinate cbuf(Nd); sobj s; sgrid->LocalIndexToLocalCoor(idx,cbuf); - peekLocalSite(s,result,cbuf); - cbuf[dim]+=((pc+p) % processors[dim])*L; - // cbuf[dim]+=p*L; - pokeLocalSite(s,pgbuf,cbuf); + peekLocalSite(s,r_v,cbuf); + acbuf[dim]+=((pc+p) % processors[dim])*L; + pokeLocalSite(s,p_v,cbuf); }); if (p != processors[dim] - 1) { result = Cshift(result,dim,L); diff --git a/Grid/allocator/MemoryManager.cc b/Grid/allocator/MemoryManager.cc index 17850333..6d638b60 100644 --- a/Grid/allocator/MemoryManager.cc +++ b/Grid/allocator/MemoryManager.cc @@ -86,23 +86,33 @@ void MemoryManager::Init(void) Ncache[AccSmall]=Nc; } } - std::cout << "MemoryManager::Init() setting up"< &l,const Coordinate &site){ ////////////////////////////////////////////////////////// // Peek a scalar object from the SIMD array ////////////////////////////////////////////////////////// +// Must be CPU read view template -inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site) +inline void peekLocalSite(sobj &s,const LatticeView &l,Coordinate &site) { - GridBase *grid = l.Grid(); - + GridBase *grid = l.getGrid(); + assert(l.mode==CpuRead); typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nsimd = grid->Nsimd(); - assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); + assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); static const int words=sizeof(vobj)/sizeof(vector_type); @@ -172,8 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice &l,Coordinate &site) idx= grid->iIndex(site); odx= grid->oIndex(site); - autoView( l_v , l, CpuRead); - scalar_type * vp = (scalar_type *)&l_v[odx]; + scalar_type * vp = (scalar_type *)&l[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w &l,Coordinate &site) return; }; - +// Must be CPU write view template -inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site) +inline void pokeLocalSite(const sobj &s,LatticeView &l,Coordinate &site) { - GridBase *grid=l.Grid(); + GridBase *grid=l.getGrid(); + assert(l.mode==CpuWrite); typedef typename vobj::scalar_type scalar_type; typedef typename vobj::vector_type vector_type; int Nsimd = grid->Nsimd(); - assert( l.Checkerboard()== l.Grid()->CheckerBoard(site)); + assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( sizeof(sobj)*Nsimd == sizeof(vobj)); static const int words=sizeof(vobj)/sizeof(vector_type); @@ -201,8 +202,7 @@ inline void pokeLocalSite(const sobj &s,Lattice &l,Coordinate &site) idx= grid->iIndex(site); odx= grid->oIndex(site); - autoView( l_v , l, CpuWrite); - scalar_type * vp = (scalar_type *)&l_v[odx]; + scalar_type * vp = (scalar_type *)&l[odx]; scalar_type * pt = (scalar_type *)&s; for(int w=0;w &in,Lattice &out) assert(ig->lSites() == og->lSites()); } + autoView(in_v,in,CpuRead); + autoView(out_v,out,CpuWrite); thread_for(idx, ig->lSites(),{ sobj s; ssobj ss; Coordinate lcoor(ni); ig->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(s,in,lcoor); + peekLocalSite(s,in_v,lcoor); ss=s; - pokeLocalSite(ss,out,lcoor); + pokeLocalSite(ss,out_v,lcoor); }); } @@ -588,8 +590,6 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro for(int w=0;w &lowDim,Lattice & higherDim,int slice } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuRead); + autoView(higherDimv,higherDim,CpuWrite); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -632,8 +634,8 @@ void InsertSlice(const Lattice &lowDim,Lattice & higherDim,int slice hcoor[d]=lcoor[ddl++]; } } - peekLocalSite(s,lowDim,lcoor); - pokeLocalSite(s,higherDim,hcoor); + peekLocalSite(s,lowDimv,lcoor); + pokeLocalSite(s,higherDimv,hcoor); }); } @@ -661,6 +663,8 @@ void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic } } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuWrite); + autoView(higherDimv,higherDim,CpuRead); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -673,8 +677,8 @@ void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic hcoor[d]=lcoor[ddl++]; } } - peekLocalSite(s,higherDim,hcoor); - pokeLocalSite(s,lowDim,lcoor); + peekLocalSite(s,higherDimv,hcoor); + pokeLocalSite(s,lowDimv,lcoor); }); } @@ -702,6 +706,8 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuRead); + autoView(higherDimv,higherDim,CpuWrite); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -710,8 +716,8 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int if( lcoor[orthog] == slice_lo ) { hcoor=lcoor; hcoor[orthog] = slice_hi; - peekLocalSite(s,lowDim,lcoor); - pokeLocalSite(s,higherDim,hcoor); + peekLocalSite(s,lowDimv,lcoor); + pokeLocalSite(s,higherDimv,hcoor); } }); } @@ -739,6 +745,8 @@ void ExtractSliceLocal(Lattice &lowDim,const Lattice & higherDim,int } // the above should guarantee that the operations are local + autoView(lowDimv,lowDim,CpuWrite); + autoView(higherDimv,higherDim,CpuRead); thread_for(idx,lg->lSites(),{ sobj s; Coordinate lcoor(nl); @@ -747,8 +755,8 @@ void ExtractSliceLocal(Lattice &lowDim,const Lattice & higherDim,int if( lcoor[orthog] == slice_lo ) { hcoor=lcoor; hcoor[orthog] = slice_hi; - peekLocalSite(s,higherDim,hcoor); - pokeLocalSite(s,lowDim,lcoor); + peekLocalSite(s,higherDimv,hcoor); + pokeLocalSite(s,lowDimv,lcoor); } }); } diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index a10acd87..3b76b921 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -43,6 +43,8 @@ public: if (grid) conformable(grid, _grid); else grid = _grid; }; + // Host only + GridBase * getGrid(void) const { return _grid; }; }; ///////////////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h index 890c680b..0c8a0930 100644 --- a/Grid/qcd/action/fermion/DomainWallVec5dImpl.h +++ b/Grid/qcd/action/fermion/DomainWallVec5dImpl.h @@ -114,19 +114,22 @@ public: U = adj(Cshift(U, mu, -1)); PokeIndex(Uadj, U, mu); } - - for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) { + + autoView(Umu_v,Umu,CpuRead); + autoView(Uadj_v,Uadj,CpuRead); + autoView(Uds_v,Uds,CpuWrite); + thread_for( lidx, GaugeGrid->lSites(), { Coordinate lcoor; GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUmu, Umu, lcoor); + peekLocalSite(ScalarUmu, Umu_v, lcoor); for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu); - peekLocalSite(ScalarUmu, Uadj, lcoor); + peekLocalSite(ScalarUmu, Uadj_v, lcoor); for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu); - pokeLocalSite(ScalarUds, Uds, lcoor); - } + pokeLocalSite(ScalarUds, Uds_v, lcoor); + }); } inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu) diff --git a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h index 2d4de18e..18fe993c 100644 --- a/Grid/qcd/action/fermion/StaggeredVec5dImpl.h +++ b/Grid/qcd/action/fermion/StaggeredVec5dImpl.h @@ -113,20 +113,7 @@ public: inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu) { - GridBase *GaugeGrid = U_ds.Grid(); - thread_for(lidx, GaugeGrid->lSites(),{ - - SiteScalarGaugeLink ScalarU; - SiteDoubledGaugeField ScalarUds; - - Coordinate lcoor; - GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor); - peekLocalSite(ScalarUds, U_ds, lcoor); - - peekLocalSite(ScalarU, U, lcoor); - ScalarUds(mu) = ScalarU(); - - }); + assert(0); } inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &UUUds, // for Naik term diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index 36447153..df1bce7c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -98,32 +98,35 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) Coordinate lcoor; typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); - for (int site = 0; site < lvol; site++) { - grid->LocalIndexToLocalCoor(site, lcoor); - EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - peekLocalSite(Qx, CloverTerm, lcoor); - Qxinv = Zero(); - //if (csw!=0){ - for (int j = 0; j < Ns; j++) - for (int k = 0; k < Ns; k++) - for (int a = 0; a < DimRep; a++) - for (int b = 0; b < DimRep; b++){ - auto zz = Qx()(j, k)(a, b); - EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex(zz); - } - // if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl; - - EigenInvCloverOp = EigenCloverOp.inverse(); - //std::cout << EigenInvCloverOp << std::endl; - for (int j = 0; j < Ns; j++) - for (int k = 0; k < Ns; k++) - for (int a = 0; a < DimRep; a++) - for (int b = 0; b < DimRep; b++) - Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); - // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; - // } - pokeLocalSite(Qxinv, CloverTermInv, lcoor); + autoView(CTv,CloverTerm,CpuRead); + autoView(CTIv,CloverTermInv,CpuWrite); + for (int site = 0; site < lvol; site++) { + grid->LocalIndexToLocalCoor(site, lcoor); + EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + peekLocalSite(Qx, CTv, lcoor); + Qxinv = Zero(); + //if (csw!=0){ + for (int j = 0; j < Ns; j++) + for (int k = 0; k < Ns; k++) + for (int a = 0; a < DimRep; a++) + for (int b = 0; b < DimRep; b++){ + auto zz = Qx()(j, k)(a, b); + EigenCloverOp(a + j * DimRep, b + k * DimRep) = std::complex(zz); + } + // if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl; + + EigenInvCloverOp = EigenCloverOp.inverse(); + //std::cout << EigenInvCloverOp << std::endl; + for (int j = 0; j < Ns; j++) + for (int k = 0; k < Ns; k++) + for (int a = 0; a < DimRep; a++) + for (int b = 0; b < DimRep; b++) + Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep); + // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; + // } + pokeLocalSite(Qxinv, CTIv, lcoor); + } } // Separate the even and odd parts diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 2a202a77..2cc308cc 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -580,16 +580,21 @@ void WilsonFermion5D::MomentumSpacePropagatorHt_5d(FermionField &out,const cosha = (one + W*W + sk) / (abs(W)*2.0); // FIXME Need a Lattice acosh - for(int idx=0;idx<_grid->lSites();idx++){ - Coordinate lcoor(Nd); - Tcomplex cc; - // RealD sgn; - _grid->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(cc,cosha,lcoor); - assert((double)real(cc)>=1.0); - assert(fabs((double)imag(cc))<=1.0e-15); - cc = ScalComplex(::acosh(real(cc)),0.0); - pokeLocalSite(cc,a,lcoor); + + { + autoView(cosha_v,cosha,CpuRead); + autoView(a_v,a,CpuWrite); + for(int idx=0;idx<_grid->lSites();idx++){ + Coordinate lcoor(Nd); + Tcomplex cc; + // RealD sgn; + _grid->LocalIndexToLocalCoor(idx,lcoor); + peekLocalSite(cc,cosha_v,lcoor); + assert((double)real(cc)>=1.0); + assert(fabs((double)imag(cc))<=1.0e-15); + cc = ScalComplex(::acosh(real(cc)),0.0); + pokeLocalSite(cc,a_v,lcoor); + } } Wea = ( exp( a) * abs(W) ); @@ -775,17 +780,20 @@ void WilsonFermion5D::MomentumSpacePropagatorHt(FermionField &out,const Fe cosha = (one + W*W + sk) / (abs(W)*2.0); // FIXME Need a Lattice acosh + { + autoView(cosha_v,cosha,CpuRead); + autoView(a_v,a,CpuWrite); for(int idx=0;idx<_grid->lSites();idx++){ Coordinate lcoor(Nd); Tcomplex cc; // RealD sgn; _grid->LocalIndexToLocalCoor(idx,lcoor); - peekLocalSite(cc,cosha,lcoor); + peekLocalSite(cc,cosha_v,lcoor); assert((double)real(cc)>=1.0); assert(fabs((double)imag(cc))<=1.0e-15); cc = ScalComplex(::acosh(real(cc)),0.0); - pokeLocalSite(cc,a,lcoor); - } + pokeLocalSite(cc,a_v,lcoor); + }} Wea = ( exp( a) * abs(W) ); Wema= ( exp(-a) * abs(W) ); diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index cd85a784..e93f3046 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -286,8 +286,6 @@ void Grid_init(int *argc,char ***argv) ////////////////////////////////////////////////////////// acceleratorInit(); // Must come first to set device prior to MPI init due to Omnipath Driver - MemoryManager::Init(); - if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){ int MB; arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm"); @@ -358,6 +356,15 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogMessage << "MPI is initialised and logging filters activated "< Date: Fri, 19 Jun 2020 17:36:05 -0400 Subject: [PATCH 7/8] Typo fix (excusee - my keyboard is starting to break) --- Grid/algorithms/FFT.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 765305d7..1a3e1eba 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -237,7 +237,7 @@ public: sobj s; sgrid->LocalIndexToLocalCoor(idx,cbuf); peekLocalSite(s,r_v,cbuf); - acbuf[dim]+=((pc+p) % processors[dim])*L; + cbuf[dim]+=((pc+p) % processors[dim])*L; pokeLocalSite(s,p_v,cbuf); }); if (p != processors[dim] - 1) { From 1a74816c25d199fd3cfda5a960ffa849f6aaa693 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 19 Jun 2020 17:50:52 -0400 Subject: [PATCH 8/8] Hopeefully fixed --- Grid/algorithms/FFT.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/Grid/algorithms/FFT.h b/Grid/algorithms/FFT.h index 1a3e1eba..ad42f049 100644 --- a/Grid/algorithms/FFT.h +++ b/Grid/algorithms/FFT.h @@ -230,16 +230,18 @@ public: result = source; int pc = processor_coor[dim]; for(int p=0;plSites(),{ + { + autoView(r_v,result,CpuRead); + autoView(p_v,pgbuf,CpuWrite); + thread_for(idx, sgrid->lSites(),{ Coordinate cbuf(Nd); sobj s; sgrid->LocalIndexToLocalCoor(idx,cbuf); peekLocalSite(s,r_v,cbuf); cbuf[dim]+=((pc+p) % processors[dim])*L; pokeLocalSite(s,p_v,cbuf); - }); + }); + } if (p != processors[dim] - 1) { result = Cshift(result,dim,L); } @@ -268,15 +270,19 @@ public: flops+= flops_call*NN; // writing out result - thread_for(idx,sgrid->lSites(),{ + { + autoView(pgbuf_v,pgbuf,CpuRead); + autoView(result_v,result,CpuWrite); + thread_for(idx,sgrid->lSites(),{ Coordinate clbuf(Nd), cgbuf(Nd); sobj s; sgrid->LocalIndexToLocalCoor(idx,clbuf); cgbuf = clbuf; cgbuf[dim] = clbuf[dim]+L*pc; - peekLocalSite(s,pgbuf,cgbuf); - pokeLocalSite(s,result,clbuf); - }); + peekLocalSite(s,pgbuf_v,cgbuf); + pokeLocalSite(s,result_v,clbuf); + }); + } result = result*div; // destroying plan