diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index a9a82f34..ba4abecd 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -262,7 +262,7 @@ public:
     autoView( Tnp_v , (*Tnp), AcceleratorWrite);
     autoView( Tnm_v , (*Tnm), AcceleratorWrite);
     const int Nsimd = CComplex::Nsimd();
-    accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
+    accelerator_for(ss, FineGrid->oSites(), Nsimd, {
       coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
       coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
     });
diff --git a/Grid/algorithms/approx/Chebyshev.h b/Grid/algorithms/approx/Chebyshev.h
index 584ed1d5..7c93f0b8 100644
--- a/Grid/algorithms/approx/Chebyshev.h
+++ b/Grid/algorithms/approx/Chebyshev.h
@@ -264,7 +264,7 @@ public:
     auto Tnp_v = Tnp->View();
     auto Tnm_v = Tnm->View();
     constexpr int Nsimd = vector_type::Nsimd();
-    accelerator_forNB(ss, in.Grid()->oSites(), Nsimd, {
+    accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
      coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
      coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
    });
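
Note on the CoarsenedMatrix.h and Chebyshev.h hunks above: both kernels implement one step of the Chebyshev three-term recurrence. Per site, y is overwritten with xscale*y + mscale*Tn (the shift that maps the operator spectrum into [-1,1]) and then Tnp = 2*y - Tnm, after which the caller rotates the Tnm/Tn/Tnp pointers. Switching from the non-blocking accelerator_forNB to the blocking accelerator_for guarantees the writes have landed before the rotated fields are read. A minimal scalar sketch of the same step (illustrative only, not Grid's API; it assumes y already holds the operator applied to Tn):

    #include <cstddef>

    // Scalar analogue of the fused kernel above (illustrative, not Grid code).
    // Assumes y[ss] already holds (A Tn)[ss], as in Grid's Chebyshev loop.
    inline void chebStep(double *y, const double *Tn, const double *Tnm, double *Tnp,
                         double xscale, double mscale, std::size_t sites)
    {
      for (std::size_t ss = 0; ss < sites; ss++) {
        y[ss]   = xscale * y[ss] + mscale * Tn[ss];  // shift/scale into [-1,1]
        Tnp[ss] = 2.0 * y[ss] - Tnm[ss];             // T_{n+1} = 2 x T_n - T_{n-1}
      }
      // The caller rotates Tnm <- Tn <- Tnp next; a non-blocking launch could
      // let the rotation overtake the in-flight kernel, hence accelerator_for.
    }
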
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
index fef4ea1f..cbdd224d 100644
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@@ -395,12 +395,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-  /*
-  * if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
-  *   this->StencilSendToRecvFromComplete(list,dir);
-  *   list.resize(0);
-  * }
-  */
+
+  // if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+  //   this->StencilSendToRecvFromComplete(list,dir);
+  // }
+
   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
index 939fda33..bdba7cb2 100644
--- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
+++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h
@@ -490,6 +490,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) {  ASM_CALL(AsmDhopSiteDag);     return;}
 #endif
+    acceleratorFenceComputeStream();
   } else if( interior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt);    return;}
@@ -497,11 +498,13 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) {  ASM_CALL(AsmDhopSiteDagInt);     return;}
 #endif
   } else if( exterior ) {
+    acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt);    return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) {  ASM_CALL(AsmDhopSiteDagExt);     return;}
 #endif
+    acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
diff --git a/Grid/qcd/action/gauge/Photon.h b/Grid/qcd/action/gauge/Photon.h
index 465aa8bd..3d4baccd 100644
--- a/Grid/qcd/action/gauge/Photon.h
+++ b/Grid/qcd/action/gauge/Photon.h
@@ -49,7 +49,7 @@ NAMESPACE_BEGIN(Grid);
 
     typedef Lattice<SiteLink>  LinkField;
    typedef Lattice<SiteField> Field;
-    typedef Field              ComplexField;
+    typedef LinkField          ComplexField;
  };
 
  typedef QedGImpl<vComplex> QedGImplR;
diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc
index 7dfbc4ff..70f469b0 100644
--- a/Grid/threads/Accelerator.cc
+++ b/Grid/threads/Accelerator.cc
@@ -201,12 +201,15 @@ void acceleratorInit(void)
 #ifdef GRID_SYCL
 
 cl::sycl::queue *theGridAccelerator;
+cl::sycl::queue *theCopyAccelerator;
 void acceleratorInit(void)
 {
   int nDevices = 1;
   cl::sycl::gpu_selector selector;
   cl::sycl::device selectedDevice { selector };
   theGridAccelerator = new sycl::queue (selectedDevice);
+  //  theCopyAccelerator = new sycl::queue (selectedDevice);
+  theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
 
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   zeInit(0);
diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 4e476abb..5ac36d15 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -248,7 +248,6 @@ inline int acceleratorIsCommunicable(void *ptr)
 //////////////////////////////////////////////
 // SyCL acceleration
 //////////////////////////////////////////////
-
 #ifdef GRID_SYCL
 NAMESPACE_END(Grid);
 #include <CL/sycl.hpp>
@@ -263,6 +262,7 @@ NAMESPACE_END(Grid);
 NAMESPACE_BEGIN(Grid);
 
 extern cl::sycl::queue *theGridAccelerator;
+extern cl::sycl::queue *theCopyAccelerator;
 
 #ifdef __SYCL_DEVICE_ONLY__
 #define GRID_SIMT
@@ -290,7 +290,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
       cgh.parallel_for(                                 \
          cl::sycl::nd_range<3>(global,local),           \
          [=] (cl::sycl::nd_item<3> item) /*mutable*/    \
-         [[intel::reqd_sub_group_size(8)]]              \
+         [[intel::reqd_sub_group_size(16)]]             \
          {                                              \
            auto iter1 = item.get_global_id(0);          \
            auto iter2 = item.get_global_id(1);          \
@@ -299,19 +299,19 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
      });                                                \
    });
 
-#define accelerator_barrier(dummy) theGridAccelerator->wait();
+#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); }
 
 inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
 inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
 inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
 inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) {
-  theGridAccelerator->memcpy(to,from,bytes);
-}
-inline void acceleratorCopySynchronise(void) { theGridAccelerator->wait(); std::cout<<"acceleratorCopySynchronise() wait "<<std::endl; }
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();}
+
+inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
+inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
+
 inline int acceleratorIsCommunicable(void *ptr)
 {
 #if 0
@@ -514,7 +514,16 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
 inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
 #endif
 
+//////////////////////////////////////////////
+// Fencing needed ONLY for SYCL
+//////////////////////////////////////////////
+#ifdef GRID_SYCL
+inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
+#else
+// Ordering within a stream guaranteed on Nvidia & AMD
+inline void acceleratorFenceComputeStream(void){ };
+#endif
 
 ///////////////////////////////////////////////////
 // Synchronise across local threads for divergence resynch
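
Context for the Accelerator changes above: a SYCL queue is out-of-order unless constructed with the in_order property, so consecutive kernel submissions are not implicitly ordered; on CUDA and HIP a stream is in-order, which is why acceleratorFenceComputeStream() is a no-op there but a full accelerator_barrier() under SYCL. The copy helpers are likewise routed through theCopyAccelerator (currently an alias of the compute queue) so they can later move to a genuinely independent copy queue. A sketch of the calling discipline this implies, using only the helpers declared above (the function and buffer names are hypothetical):

    #include <cstddef>

    // Hypothetical halo-exchange fragment.  Once theCopyAccelerator is a
    // separate queue, copies are NOT implicitly ordered against compute
    // kernels, so both synchronisation points below are required.
    void exchangeThenExterior(void *send_buf, void *recv_buf, std::size_t bytes)
    {
      acceleratorCopyDeviceToDeviceAsynch(send_buf, recv_buf, bytes); // copy queue
      acceleratorCopySynchronise();      // drain the copy queue
      acceleratorFenceComputeStream();   // SYCL: order against earlier kernels
      // ... now safe to launch the exterior kernel that reads recv_buf ...
    }
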
diff --git a/Grid/util/Sha.h b/Grid/util/Sha.h
index ee164c34..f3789ac4 100644
--- a/Grid/util/Sha.h
+++ b/Grid/util/Sha.h
@@ -27,6 +27,7 @@
 /*  END LEGAL */
 extern "C" {
 #include <openssl/sha.h>
+#include <openssl/evp.h>
 }
 #ifdef USE_IPP
 #include "ipp.h"
@@ -70,10 +71,8 @@ public:
   static inline std::vector<unsigned char> sha256(const void *data,size_t bytes)
   {
     std::vector<unsigned char> hash(SHA256_DIGEST_LENGTH);
-    SHA256_CTX sha256;
-    SHA256_Init  (&sha256);
-    SHA256_Update(&sha256, data,bytes);
-    SHA256_Final (&hash[0], &sha256);
+    auto digest = EVP_get_digestbyname("SHA256");
+    EVP_Digest(data, bytes, &hash[0], NULL, digest, NULL);
     return hash;
   }
   static inline std::vector<int> sha256_seeds(const std::string &s)
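
The Sha.h hunk above replaces the SHA256_Init/Update/Final trio, deprecated in OpenSSL 3.0, with the one-shot EVP_Digest(). For reference, a self-contained sketch of the same interface; it uses EVP_sha256() directly, which sidesteps the possible NULL return of EVP_get_digestbyname() (the wrapper name is illustrative):

    #include <openssl/evp.h>
    #include <string>
    #include <vector>

    // One-shot SHA-256 through the EVP interface adopted above.
    std::vector<unsigned char> sha256_of(const std::string &s)
    {
      std::vector<unsigned char> hash(EVP_MD_size(EVP_sha256()));
      EVP_Digest(s.data(), s.size(), hash.data(),
                 nullptr,        // optional out-parameter: digest length
                 EVP_sha256(),   // digest type, no name lookup required
                 nullptr);       // no ENGINE
      return hash;
    }
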
diff --git a/README.md b/README.md
index 88b922a5..4af52d78 100644
--- a/README.md
+++ b/README.md
@@ -148,7 +148,7 @@ If you want to build all the tests at once just use `make tests`.
 - `--enable-mkl[=<path>]`: use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional).
 - `--enable-numa`: enable NUMA first touch optimisation
 - `--enable-simd=<code>`: setup Grid for the SIMD target `<code>` (default: `GEN`). A list of possible SIMD targets is detailed in a section below.
-- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes).
+- `--enable-gen-simd-width=<size>`: select the size (in bytes) of the generic SIMD vector type (default: 64 bytes).
 - `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible comms targets is detailed in a section below.
 - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo`).
 - `--disable-timers`: disable system dependent high-resolution timers.
diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh
new file mode 100644
index 00000000..923afd84
--- /dev/null
+++ b/systems/PVC/benchmarks/run-1tile.sh
@@ -0,0 +1,62 @@
+#!/bin/sh
+##SBATCH -p PVC-SPR-QZEH
+##SBATCH -p PVC-ICX-QZNW
+#SBATCH -p QZ1J-ICX-PVC
+##SBATCH -p QZ1J-SPR-PVC-2C
+
+source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+
+export NT=16
+
+export I_MPI_OFFLOAD=1
+export I_MPI_OFFLOAD_TOPOLIB=level_zero
+export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
+
+# export IGC_EnableLSCFenceUGMBeforeEOT=0
+# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
+export SYCL_DEVICE_FILTER=gpu,level_zero
+#export IGC_ShaderDumpEnable=1
+#export IGC_DumpToCurrentDir=1
+export I_MPI_OFFLOAD_CELL=tile
+export EnableImplicitScaling=0
+export EnableWalkerPartition=0
+export ZE_AFFINITY_MASK=0.0
+mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
+
+export ZE_AFFINITY_MASK=0
+export I_MPI_OFFLOAD_CELL=device
+export EnableImplicitScaling=1
+export EnableWalkerPartition=1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#mpiexec -launcher ssh -n 2 -host localhost vtune -collect gpu-hotspots -knob gpu-sampling-interval=1 -data-limit=0 -r ./vtune_run4 -- ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1
+
+#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1
+
+#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
+
+#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 0
+#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 1
+
diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh
new file mode 100755
index 00000000..9db0b66b
--- /dev/null
+++ b/systems/PVC/benchmarks/run-2tile-mpi.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+##SBATCH -p PVC-SPR-QZEH
+##SBATCH -p PVC-ICX-QZNW
+#SBATCH -p QZ1J-ICX-PVC
+
+source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+
+export NT=16
+
+
+# export IGC_EnableLSCFenceUGMBeforeEOT=0
+# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
+#export IGC_ShaderDumpEnable=1
+#export IGC_DumpToCurrentDir=1
+export I_MPI_OFFLOAD=1
+export I_MPI_OFFLOAD_TOPOLIB=level_zero
+export I_MPI_OFFLOAD_DOMAIN_SIZE=-1
+export SYCL_DEVICE_FILTER=gpu,level_zero
+export I_MPI_OFFLOAD_CELL=tile
+export EnableImplicitScaling=0
+export EnableWalkerPartition=0
+
+mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log
+
+mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log
+
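
On the ZE_AFFINITY_MASK settings in the two scripts above: the Level Zero mask uses device.subdevice syntax, so 0.0 exposes only tile 0 of GPU 0 while plain 0 exposes the whole two-tile device, which is why I_MPI_OFFLOAD_CELL and the implicit-scaling flags are flipped at the same time. The wrap.sh script below applies the same idea per MPI rank; a hypothetical C++ equivalent of that per-rank pinning (function name and rank argument are illustrative; the variable must be set before the SYCL runtime creates its first queue):

    #include <cstdlib>
    #include <string>

    // Pin the calling rank to one PVC tile: device 0, subdevice localRank.
    // Mirrors wrap.sh's "export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID" (POSIX setenv).
    void pinRankToTile(int localRank)
    {
      std::string mask = "0." + std::to_string(localRank);
      setenv("ZE_AFFINITY_MASK", mask.c_str(), /*overwrite=*/1);
    }
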
diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh
new file mode 100755
index 00000000..a352fff9
--- /dev/null
+++ b/systems/PVC/benchmarks/wrap.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
+
+echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
+
+
+if [ $MPI_LOCALRANKID = "0" ]
+then
+#  ~psteinbr/build_pti/ze_tracer -c $@
+  onetrace --chrome-kernel-timeline $@
+else
+  $@
+fi
diff --git a/systems/PVC/config-command b/systems/PVC/config-command
new file mode 100644
index 00000000..3f5b5993
--- /dev/null
+++ b/systems/PVC/config-command
@@ -0,0 +1,15 @@
+INSTALL=/nfs/site/home/azusayax/install
+../../configure \
+   --enable-simd=GPU \
+   --enable-gen-simd-width=64 \
+   --enable-comms=mpi \
+   --disable-accelerator-cshift \
+   --disable-gparity \
+   --disable-fermion-reps \
+   --enable-shm=nvlink \
+   --enable-accelerator=sycl \
+   --enable-unified=yes \
+   CXX=mpicxx \
+   LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
+   CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
+
diff --git a/systems/PVC/setup.sh b/systems/PVC/setup.sh
new file mode 100644
index 00000000..2a6f920b
--- /dev/null
+++ b/systems/PVC/setup.sh
@@ -0,0 +1,11 @@
+export https_proxy=http://proxy-chain.intel.com:911
+export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
+
+module load intel-release
+source /opt/intel/oneapi/PVC_setup.sh
+#source /opt/intel/oneapi/ATS_setup.sh
+module load intel/mpich/pvc45.3
+export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH
+
+#clsh embargo-ci-neo-022845
+#source /opt/intel/vtune_amplifier/amplxe-vars.sh
diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc
index 468bc982..16205ee1 100644
--- a/tests/Test_simd.cc
+++ b/tests/Test_simd.cc
@@ -793,6 +793,7 @@ int main (int argc, char ** argv)
   }
   std::cout <<" OK ! "<<std::endl;