// Review notes for this patch (leading commentary placed before the first "diff --git" header).
//
// Grid/qcd/action/fermion/WilsonCompressor.h
//   Adds the macro vet_same_node(a,b), which expands to { auto tmp = b; }.
//   The sixteen assert(this->same_node[dir]==this->HaloGatherDir(...)) lines (eight per branch of
//   "if ( dag )") become vet_same_node(this->same_node[dir],this->HaloGatherDir(...)).
//   The macro never uses its first argument, so HaloGatherDir is still called but its result is
//   no longer compared with same_node[dir].
//
// Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
//   Adds the macro ASM_CALL_SLICE(A); visible part: grid = in.Grid(), nt = grid->LocalDimensions()[4],
//   nxyz = Nsite/nt, followed by a loop over t.
//   NOTE(review): this hunk looks truncated in this copy (the for-loop header runs straight into
//   "::A(" and template argument lists are missing) -- check against the original patch.
//
// Grid/stencil/Stencil.h
//   Comment-only changes inside SameNode: an empty "//" becomes "// fancy calculation for shm code",
//   and a FIXME is appended to "else nbr_proc = pd-1;".
//
// Grid/threads/Accelerator.h
//   Removes the printf calls from the accelerator_barrier macro and from acceleratorCopySynchronise;
//   both still call wait() on their respective queue pointers.
//
// systems/PVC/benchmarks/run-1tile.sh
//   The mpiexec line drops "--cacheblocking 8.8.8.8" and changes "--shm-mpi 1" to "--shm-mpi 0".
//
// systems/PVC/benchmarks/run-2tile-mpi.sh
//   The 1-rank mpiexec line is commented out; the 2-rank line changes to "--shm-mpi 0" and no longer
//   redirects its output to 2tile.log.
//
// systems/PVC/benchmarks/wrap.sh
//   The if/then/else/fi lines and the onetrace line are commented out, leaving "$@" to run
//   unconditionally.
//
// systems/PVC/config-command
//   INSTALL points at a different directory, --enable-unified changes from yes to no, and
//   -I$INSTALL/include moves to the end of CXXFLAGS.
//
// systems/PVC/setup.sh
//   The previous LD_LIBRARY_PATH export is commented out and replaced by one using $HOME/prereqs/lib/.
diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index fd1bbe89..5523ae8a 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -484,24 +484,26 @@ public: int dag = compress.dag; int face_idx=0; +#define vet_same_node(a,b) \ + { auto tmp = b; } if ( dag ) { - assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); - assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); - assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); - assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); - assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); - assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); - assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); - assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); + vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx)); + vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx)); + vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); + vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx)); + vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx)); + vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx)); + vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); + vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx)); } else { - assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); - assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); - assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); - 
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); - assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); - assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); - assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); - assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); + vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx)); + vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx)); + vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); + vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx)); + vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx)); + vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx)); + vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); + vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx)); } this->face_table_computed=1; assert(this->u_comm_offset==this->_unified_buffer_size); diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index b307fad4..fcf1f1f3 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -439,6 +439,17 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S int sF = ss*Ls; \ WilsonKernels::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ }); +#define ASM_CALL_SLICE(A) \ + auto grid = in.Grid() ; \ + int nt = grid->LocalDimensions()[4]; \ + int nxyz = Nsite/nt ; \ + for(int t=0;t::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ + });} template void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, 
SiteHalfSpinor * buf, diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 29aa876f..c8703b9f 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -358,7 +358,7 @@ public: return 0; } #else - // + // fancy calculation for shm code inline int SameNode(int point) { int dimension = this->_directions[point]; @@ -378,7 +378,7 @@ public: int nbr_proc; if (displacement>0) nbr_proc = 1; - else nbr_proc = pd-1; + else nbr_proc = pd-1; // FIXME this logic needs to be sorted for three link term // assert( (displacement==1) || (displacement==-1)); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 04ae885b..2dde1433 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -305,14 +305,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { }); \ }); -#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); } +#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} diff --git 
a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh index 923afd84..3c594ab6 100644 --- a/systems/PVC/benchmarks/run-1tile.sh +++ b/systems/PVC/benchmarks/run-1tile.sh @@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 export ZE_AFFINITY_MASK=0.0 -mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8 +mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 export ZE_AFFINITY_MASK=0 export I_MPI_OFFLOAD_CELL=device diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh index 9db0b66b..fa56d5ec 100755 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ b/systems/PVC/benchmarks/run-2tile-mpi.sh @@ -20,7 +20,7 @@ export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 -mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log +#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log +mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh index a352fff9..0e48625b 100755 --- a/systems/PVC/benchmarks/wrap.sh +++ b/systems/PVC/benchmarks/wrap.sh @@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID echo 
Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK -if [ $MPI_LOCALRANKID = "0" ] -then +#if [ $MPI_LOCALRANKID = "0" ] +#then # ~psteinbr/build_pti/ze_tracer -c $@ - onetrace --chrome-kernel-timeline $@ -else +# onetrace --chrome-kernel-timeline $@ +#else $@ -fi +#fi diff --git a/systems/PVC/config-command b/systems/PVC/config-command index dc6b222c..7549f2b4 100644 --- a/systems/PVC/config-command +++ b/systems/PVC/config-command @@ -1,4 +1,4 @@ -INSTALL=/nfs/site/home/azusayax/install +INSTALL=/nfs/site/home/paboylx/prereqs/ ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ @@ -8,8 +8,8 @@ INSTALL=/nfs/site/home/azusayax/install --disable-fermion-reps \ --enable-shm=nvlink \ --enable-accelerator=sycl \ - --enable-unified=yes \ + --enable-unified=no \ CXX=mpicxx \ LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare" + CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -Wno-tautological-constant-compare -I$INSTALL/include" diff --git a/systems/PVC/setup.sh b/systems/PVC/setup.sh index 2a6f920b..9b515a62 100644 --- a/systems/PVC/setup.sh +++ b/systems/PVC/setup.sh @@ -1,5 +1,6 @@ export https_proxy=http://proxy-chain.intel.com:911 -export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH +#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH module load intel-release source /opt/intel/oneapi/PVC_setup.sh