From 4f85672674a018f1213af057e6e3b09ada600347 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 17 Dec 2022 20:16:11 -0500 Subject: [PATCH 1/9] Simpler test for PETSc --- tests/core/Test_fft_matt.cc | 160 ++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 tests/core/Test_fft_matt.cc diff --git a/tests/core/Test_fft_matt.cc b/tests/core/Test_fft_matt.cc new file mode 100644 index 00000000..d4455a7e --- /dev/null +++ b/tests/core/Test_fft_matt.cc @@ -0,0 +1,160 @@ + /************************************************************************************* + + grid` physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< seeds({1,2,3,4}); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); // naughty seeding + GridParallelRNG pRNG(&GRID); + pRNG.SeedFixedIntegers(seeds); + + LatticeGaugeFieldD Umu(&GRID); + + SU::ColdConfiguration(pRNG,Umu); // Unit gauge + + //////////////////////////////////////////////////// + // Wilson test + //////////////////////////////////////////////////// + { + LatticeFermionD src(&GRID); gaussian(pRNG,src); + LatticeFermionD tmp(&GRID); + LatticeFermionD ref(&GRID); + + RealD mass=0.01; + WilsonFermionD Dw(Umu,GRID,RBGRID,mass); + + Dw.M(src,tmp); + + std::cout << "Dw src = " < HermOp(Dw); + ConjugateGradient CG(1.0e-10,10000); + CG(HermOp,src,result); + + //////////////////////////////////////////////////////////////////////// + std::cout << " Taking difference" < Date: Tue, 14 Mar 2023 09:07:36 -0700 Subject: [PATCH 2/9] SHared compile --- Grid/allocator/MemoryManagerShared.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 2434ad47..e291ef89 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -13,7 +13,7 @@ uint64_t MemoryManager::DeviceToHostBytes; uint64_t MemoryManager::HostToDeviceXfer; uint64_t MemoryManager::DeviceToHostXfer; -void MemoryManager::Audit(void){}; +void MemoryManager::Audit(std::string s){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; int MemoryManager::isOpen (void* CpuPtr) { return 0;} From cad5b187ddcbce278bdca06b42dfb4ac6bbc59d0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 09:08:16 -0700 Subject: [PATCH 3/9] Cleanup --- Grid/qcd/action/fermion/WilsonKernels.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 68422f28..2d868c27 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -52,13 +52,6 @@ public: typedef AcceleratorVector StencilVector; public: -#ifdef GRID_SYCL -#define SYCL_HACK -#endif -#ifdef SYCL_HACK - static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf, - int ss,int sU,const SiteSpinor *in, SiteSpinor *out); -#endif static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, From eeb6e0a6e371407ec4d445f7d0459e6a951c2449 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 09:10:27 -0700 Subject: [PATCH 4/9] Renable cache blocking and efficient UPI type SHM comms --- .../WilsonFermion5DImplementation.h | 4 + .../WilsonFermionImplementation.h | 3 + .../WilsonKernelsImplementation.h | 3 +- Grid/stencil/Stencil.h | 104 ++++++++++++++++-- configure.ac | 8 ++ 5 files changed, 112 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h index 4ca24789..1ddb30ba 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermion5DImplementation.h @@ -63,6 +63,10 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, _tmp(&FiveDimRedBlackGrid), Dirichlet(0) { + Stencil.lo = &Lebesgue; + StencilEven.lo = &LebesgueEvenOdd; + StencilOdd.lo = &LebesgueEvenOdd; + // some assertions assert(FiveDimGrid._ndimension==5); assert(FourDimGrid._ndimension==4); diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 2833fdc4..1a262533 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -60,6 +60,9 @@ WilsonFermion::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid, _tmp(&Hgrid), anisotropyCoeff(anis) { + Stencil.lo = &Lebesgue; + StencilEven.lo = &LebesgueEvenOdd; + StencilOdd.lo = &LebesgueEvenOdd; // Allocate the required comms buffer ImportGauge(_Umu); if (anisotropyCoeff.isAnisotropic){ diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index bdba7cb2..b307fad4 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -433,7 +433,8 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S }); #define ASM_CALL(A) \ - thread_for( ss, Nsite, { \ + thread_for( sss, Nsite, { \ + int ss = st.lo->Reorder(sss); \ int sU = ss; \ int sF = ss*Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index cffede12..a74b720d 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -290,9 +290,9 @@ public: protected: GridBase * _grid; - public: GridBase *Grid(void) const { return _grid; } + LebesgueOrder *lo; //////////////////////////////////////////////////////////////////////// // Needed to conveniently communicate gparity parameters into GPU memory @@ -337,6 +337,7 @@ public: //////////////////////////////////////// // Stencil query //////////////////////////////////////// +#ifdef SHM_FAST_PATH inline int SameNode(int point) { int dimension = this->_directions[point]; @@ -356,7 +357,40 @@ public: if ( displacement == 0 ) return 1; return 0; } +#else + // + inline int SameNode(int point) { + int dimension = this->_directions[point]; + int displacement = this->_distances[point]; + + int pd = _grid->_processors[dimension]; + int fd = _grid->_fdimensions[dimension]; + int ld = _grid->_ldimensions[dimension]; + int rd = _grid->_rdimensions[dimension]; + int simd_layout = _grid->_simd_layout[dimension]; + int comm_dim = _grid->_processors[dimension] >1 ; + + int recv_from_rank; + int xmit_to_rank; + + if ( ! comm_dim ) return 1; + + int nbr_proc; + if (displacement>0) nbr_proc = 1; + else nbr_proc = pd-1; + + // FIXME this logic needs to be sorted for three link term + // assert( (displacement==1) || (displacement==-1)); + // Present hack only works for >= 4^4 subvol per node + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); + + if ( (shm==NULL) ) return 0; + return 1; + } +#endif ////////////////////////////////////////// // Comms packet queue for asynch thread // Use OpenMP Tasks for cleaner ??? @@ -1056,7 +1090,7 @@ public: int comms_recv = this->_comms_recv[point]; int comms_partial_send = this->_comms_partial_send[point] ; int comms_partial_recv = this->_comms_partial_recv[point] ; - + assert(rhs.Grid()==_grid); // conformable(_grid,rhs.Grid()); @@ -1127,11 +1161,32 @@ public: recv_buf=this->u_recv_buf_p; } + // potential SHM fast path for intranode + int shm_send=0; + int shm_recv=0; +#ifdef SHM_FAST_PATH + // Put directly in place if we can + send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,recv_buf); + if ( (send_buf==NULL) ) { + shm_send=0; + send_buf = this->u_send_buf_p; + } else { + shm_send=1; + } + void *test_ptr = _grid->ShmBufferTranslate(recv_from_rank,recv_buf); + if ( test_ptr != NULL ) shm_recv = 1; + // static int printed; + // if (!printed){ + // std::cout << " GATHER FAST PATH SHM "<u_send_buf_p; // Gather locally, must send assert(send_buf!=NULL); +#endif // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - +#ifdef SHM_FAST_PATH + #warning STENCIL SHM FAST PATH SELECTED + // shm == receive pointer if offnode + // shm == Translate[send pointer] if on node -- my view of his send pointer + cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp); + if (shm==NULL) { + shm = rp; + // we found a packet that comes from MPI and contributes to this shift. + // is_same_node is only used in the WilsonStencil, and gets set for this point in the stencil. + // Kernel will add the exterior_terms except if is_same_node. + // leg of stencil + shm_recv=0; + } else { + shm_recv=1; + } + rpointers[i] = shm; + // Test send side + void *test_ptr = (void *) _grid->ShmBufferTranslate(xmit_to_rank,sp); + if ( test_ptr != NULL ) shm_send = 1; + // static int printed; + // if (!printed){ + // std::cout << " GATHERSIMD FAST PATH SHM "<u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); } diff --git a/configure.ac b/configure.ac index 2e6199c7..fedca3fe 100644 --- a/configure.ac +++ b/configure.ac @@ -646,6 +646,14 @@ case ${ac_SHM_FORCE_MPI} in ;; *) ;; esac +############### force MPI in SMP +AC_ARG_ENABLE([shm-fast-path],[AS_HELP_STRING([--enable-shm-fast-path],[Allow kernels to remote copy over intranode])],[ac_SHM_FAST_PATH=${enable_shm_fast_path}],[ac_SHM_FAST_PATH=no]) +case ${ac_SHM_FAST_PATH} in + yes) + AC_DEFINE([SHM_FAST_PATH],[1],[SHM_FAST_PATH] ) + ;; + *) ;; +esac ############### communication type selection AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes]) From f36b87deb5451a3c1d47c4a25502319395478c3e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 12:09:00 -0700 Subject: [PATCH 5/9] syscall fix --- Grid/communicator/SharedMemoryMPI.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 4993a02e..9a273dc4 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -29,6 +29,7 @@ Author: Christoph Lehner #include #include +#include #ifdef GRID_CUDA #include From 14cc142a1424d02883887572cef47d1677a91fb1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 12:09:26 -0700 Subject: [PATCH 6/9] Warning remove --- Grid/stencil/Stencil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index a74b720d..29aa876f 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -387,7 +387,7 @@ public: void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,this->u_recv_buf_p); - if ( (shm==NULL) ) return 0; + if ( shm==NULL ) return 0; return 1; } #endif From 861e5d7f4c0bb56cbb5de550cb2a14e5c1ad15f7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 12:10:02 -0700 Subject: [PATCH 7/9] SYCL version update. Why do they keep making incompatible changes --- Grid/threads/Accelerator.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 5ac36d15..04ae885b 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -248,17 +248,23 @@ inline int acceleratorIsCommunicable(void *ptr) ////////////////////////////////////////////// // SyCL acceleration ////////////////////////////////////////////// -#ifdef GRID_SYCL -NAMESPACE_END(Grid); -#include -#include +#ifdef GRID_SYCL #define GRID_SYCL_LEVEL_ZERO_IPC -#ifdef GRID_SYCL_LEVEL_ZERO_IPC +NAMESPACE_END(Grid); +#if 0 +#include +#include #include #include +#else +#include +#include +#include +#include #endif + NAMESPACE_BEGIN(Grid); extern cl::sycl::queue *theGridAccelerator; From a997d24743abb33c2ec1778bafd805793d1919f7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 14 Mar 2023 12:10:31 -0700 Subject: [PATCH 8/9] Remove nofma --- systems/PVC/config-command | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/PVC/config-command b/systems/PVC/config-command index 3f5b5993..dc6b222c 100644 --- a/systems/PVC/config-command +++ b/systems/PVC/config-command @@ -11,5 +11,5 @@ INSTALL=/nfs/site/home/azusayax/install --enable-unified=yes \ CXX=mpicxx \ LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare" + CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare" From e1c326558a54a38e138e9eeaead1aaabaab57440 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 21 Mar 2023 08:53:56 -0700 Subject: [PATCH 9/9] COmms improvements --- Grid/qcd/action/fermion/WilsonCompressor.h | 34 ++++++++++--------- .../WilsonKernelsImplementation.h | 11 ++++++ Grid/stencil/Stencil.h | 4 +-- Grid/threads/Accelerator.h | 4 +-- systems/PVC/benchmarks/run-1tile.sh | 2 +- systems/PVC/benchmarks/run-2tile-mpi.sh | 4 +-- systems/PVC/benchmarks/wrap.sh | 10 +++--- systems/PVC/config-command | 6 ++-- systems/PVC/setup.sh | 3 +- 9 files changed, 46 insertions(+), 32 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index fd1bbe89..5523ae8a 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -484,24 +484,26 @@ public: int dag = compress.dag; int face_idx=0; +#define vet_same_node(a,b) \ + { auto tmp = b; } if ( dag ) { - assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); - assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); - assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); - assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); - assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); - assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); - assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); - assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx)); } else { - assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); - assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); - assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); - assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); - assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); - assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); - assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); - assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx)); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index b307fad4..fcf1f1f3 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -439,6 +439,17 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S int sF = ss*Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ }); +#define ASM_CALL_SLICE(A) \ + auto grid = in.Grid() ; \ + int nt = grid->LocalDimensions()[4]; \ + int nxyz = Nsite/nt ; \ + for(int t=0;t::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ + });} template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 29aa876f..c8703b9f 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -358,7 +358,7 @@ public: return 0; } #else - // + // fancy calculation for shm code inline int SameNode(int point) { int dimension = this->_directions[point]; @@ -378,7 +378,7 @@ public: int nbr_proc; if (displacement>0) nbr_proc = 1; - else nbr_proc = pd-1; + else nbr_proc = pd-1; // FIXME this logic needs to be sorted for three link term // assert( (displacement==1) || (displacement==-1)); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 04ae885b..2dde1433 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -305,14 +305,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { }); \ }); -#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); } +#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh index 923afd84..3c594ab6 100644 --- a/systems/PVC/benchmarks/run-1tile.sh +++ b/systems/PVC/benchmarks/run-1tile.sh @@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 export ZE_AFFINITY_MASK=0.0 -mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8 +mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 export ZE_AFFINITY_MASK=0 export I_MPI_OFFLOAD_CELL=device diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh index 9db0b66b..fa56d5ec 100755 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ b/systems/PVC/benchmarks/run-2tile-mpi.sh @@ -20,7 +20,7 @@ export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 -mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log +#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh index a352fff9..0e48625b 100755 --- a/systems/PVC/benchmarks/wrap.sh +++ b/systems/PVC/benchmarks/wrap.sh @@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK -if [ $MPI_LOCALRANKID = "0" ] -then +#if [ $MPI_LOCALRANKID = "0" ] +#then # ~psteinbr/build_pti/ze_tracer -c $@ - onetrace --chrome-kernel-timeline $@ -else +# onetrace --chrome-kernel-timeline $@ +#else $@ -fi +#fi diff --git a/systems/PVC/config-command b/systems/PVC/config-command index dc6b222c..7549f2b4 100644 --- a/systems/PVC/config-command +++ b/systems/PVC/config-command @@ -1,4 +1,4 @@ -INSTALL=/nfs/site/home/azusayax/install +INSTALL=/nfs/site/home/paboylx/prereqs/ ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ @@ -8,8 +8,8 @@ INSTALL=/nfs/site/home/azusayax/install --disable-fermion-reps \ --enable-shm=nvlink \ --enable-accelerator=sycl \ - --enable-unified=yes \ + --enable-unified=no \ CXX=mpicxx \ LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare" + CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -Wno-tautological-constant-compare -I$INSTALL/include" diff --git a/systems/PVC/setup.sh b/systems/PVC/setup.sh index 2a6f920b..9b515a62 100644 --- a/systems/PVC/setup.sh +++ b/systems/PVC/setup.sh @@ -1,5 +1,6 @@ export https_proxy=http://proxy-chain.intel.com:911 -export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH +#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH module load intel-release source /opt/intel/oneapi/PVC_setup.sh