1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-09-20 09:15:38 +01:00

COmms improvements

This commit is contained in:
Peter Boyle 2023-03-21 08:53:56 -07:00
parent a997d24743
commit e1c326558a
9 changed files with 46 additions and 32 deletions

View File

@ -484,24 +484,26 @@ public:
int dag = compress.dag; int dag = compress.dag;
int face_idx=0; int face_idx=0;
#define vet_same_node(a,b) \
{ auto tmp = b; }
if ( dag ) { if ( dag ) {
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx)); vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx)); vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx)); vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx)); vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else { } else {
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx)); vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx)); vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx)); vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx)); vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx)); vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx)); vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
} }
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);

View File

@ -439,6 +439,17 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
int sF = ss*Ls; \ int sF = ss*Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \ WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
}); });
#define ASM_CALL_SLICE(A) \
auto grid = in.Grid() ; \
int nt = grid->LocalDimensions()[4]; \
int nxyz = Nsite/nt ; \
for(int t=0;t<nt;t++){ \
thread_for( sss, nxyz, { \
int ss = t*nxyz+sss; \
int sU = ss; \
int sF = ss*Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
});}
template <class Impl> template <class Impl>
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,

View File

@ -358,7 +358,7 @@ public:
return 0; return 0;
} }
#else #else
// // fancy calculation for shm code
inline int SameNode(int point) { inline int SameNode(int point) {
int dimension = this->_directions[point]; int dimension = this->_directions[point];
@ -378,7 +378,7 @@ public:
int nbr_proc; int nbr_proc;
if (displacement>0) nbr_proc = 1; if (displacement>0) nbr_proc = 1;
else nbr_proc = pd-1; else nbr_proc = pd-1;
// FIXME this logic needs to be sorted for three link term // FIXME this logic needs to be sorted for three link term
// assert( (displacement==1) || (displacement==-1)); // assert( (displacement==1) || (displacement==-1));

View File

@ -305,14 +305,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
}); \ }); \
}); });
#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); } #define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); } inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}

View File

@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0 export EnableImplicitScaling=0
export EnableWalkerPartition=0 export EnableWalkerPartition=0
export ZE_AFFINITY_MASK=0.0 export ZE_AFFINITY_MASK=0.0
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8 mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
export ZE_AFFINITY_MASK=0 export ZE_AFFINITY_MASK=0
export I_MPI_OFFLOAD_CELL=device export I_MPI_OFFLOAD_CELL=device

View File

@ -20,7 +20,7 @@ export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0 export EnableImplicitScaling=0
export EnableWalkerPartition=0 export EnableWalkerPartition=0
mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log #mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0

View File

@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
if [ $MPI_LOCALRANKID = "0" ] #if [ $MPI_LOCALRANKID = "0" ]
then #then
# ~psteinbr/build_pti/ze_tracer -c $@ # ~psteinbr/build_pti/ze_tracer -c $@
onetrace --chrome-kernel-timeline $@ # onetrace --chrome-kernel-timeline $@
else #else
$@ $@
fi #fi

View File

@ -1,4 +1,4 @@
INSTALL=/nfs/site/home/azusayax/install INSTALL=/nfs/site/home/paboylx/prereqs/
../../configure \ ../../configure \
--enable-simd=GPU \ --enable-simd=GPU \
--enable-gen-simd-width=64 \ --enable-gen-simd-width=64 \
@ -8,8 +8,8 @@ INSTALL=/nfs/site/home/azusayax/install
--disable-fermion-reps \ --disable-fermion-reps \
--enable-shm=nvlink \ --enable-shm=nvlink \
--enable-accelerator=sycl \ --enable-accelerator=sycl \
--enable-unified=yes \ --enable-unified=no \
CXX=mpicxx \ CXX=mpicxx \
LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare" CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -Wno-tautological-constant-compare -I$INSTALL/include"

View File

@ -1,5 +1,6 @@
export https_proxy=http://proxy-chain.intel.com:911 export https_proxy=http://proxy-chain.intel.com:911
export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH #export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH
module load intel-release module load intel-release
source /opt/intel/oneapi/PVC_setup.sh source /opt/intel/oneapi/PVC_setup.sh