diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 87c3bf68..621e4ec5 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -28,6 +28,9 @@ Author: Christoph Lehner #if defined(GRID_CUDA)||defined(GRID_HIP) #include #endif +#if defined(GRID_SYCL) +#include +#endif NAMESPACE_BEGIN(Grid); @@ -124,7 +127,7 @@ inline Double max(const Double *arg, Integer osites) template inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) { -#if defined(GRID_CUDA)||defined(GRID_HIP) +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) return sum_gpu(arg,osites); #else return sum_cpu(arg,osites); @@ -133,7 +136,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) template inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) { -#if defined(GRID_CUDA)||defined(GRID_HIP) +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) return sumD_gpu(arg,osites); #else return sumD_cpu(arg,osites); @@ -142,7 +145,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) template inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) { -#if defined(GRID_CUDA)||defined(GRID_HIP) +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) return sumD_gpu_large(arg,osites); #else return sumD_cpu(arg,osites); @@ -152,13 +155,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) template inline typename vobj::scalar_object sum(const Lattice &arg) { -#if defined(GRID_CUDA)||defined(GRID_HIP) - autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum_gpu(&arg_v[0],osites); +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) + typename vobj::scalar_object ssum; + autoView( arg_v, arg, AcceleratorRead); + ssum= sum_gpu(&arg_v[0],osites); #else autoView(arg_v, arg, CpuRead); - Integer osites = arg.Grid()->oSites(); auto ssum= sum_cpu(&arg_v[0],osites); #endif arg.Grid()->GlobalSum(ssum); @@ -168,7 +171,7 @@ inline typename vobj::scalar_object sum(const Lattice &arg) template inline typename vobj::scalar_object sum_large(const Lattice &arg) { -#if defined(GRID_CUDA)||defined(GRID_HIP) +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); auto ssum= sum_gpu_large(&arg_v[0],osites); @@ -232,11 +235,10 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & typedef decltype(innerProductD(vobj(),vobj())) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - { autoView( left_v , left, AcceleratorRead); autoView( right_v,right, AcceleratorRead); - + // This code could read coalesce // GPU - SIMT lane compliance... accelerator_for( ss, sites, nsimd,{ auto x_l = left_v(ss); diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h new file mode 100644 index 00000000..90980c4c --- /dev/null +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -0,0 +1,125 @@ +NAMESPACE_BEGIN(Grid); + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// Possibly promote to double and sum +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_objectD sobjD; + sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator); + sobj identity; zeroit(identity); + sobj ret ; + + Integer nsimd= vobj::Nsimd(); + + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{osites}, + Reduction, + [=] (cl::sycl::id<1> item, auto &sum) { + auto osite = item[0]; + sum +=Reduce(lat[osite]); + }); + }); + theGridAccelerator->wait(); + ret = mysum[0]; + free(mysum,*theGridAccelerator); + sobjD dret; convertType(dret,ret); + return dret; +} + +template +inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites) +{ + return sumD_gpu_tensor(lat,osites); +} +template +inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) +{ + return sumD_gpu_large(lat,osites); +} + +template +inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) +{ + return sumD_gpu_large(lat,osites); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// Return as same precision as input performing reduction in double precision though +///////////////////////////////////////////////////////////////////////////////////////////////////////// +template +inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) +{ + typedef typename vobj::scalar_object sobj; + sobj result; + result = sumD_gpu(lat,osites); + return result; +} + +template +inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites) +{ + typedef typename vobj::scalar_object sobj; + sobj result; + result = sumD_gpu_large(lat,osites); + return result; +} + +NAMESPACE_END(Grid); + +/* +template Double svm_reduce(Double *vec,uint64_t L) +{ + Double sumResult; zeroit(sumResult); + Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator); + Double identity; zeroit(identity); + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{L}, + Reduction, + [=] (cl::sycl::id<1> index, auto &sum) { + sum +=vec[index]; + }); + }); + theGridAccelerator->wait(); + Double ret = d_sum[0]; + free(d_sum,*theGridAccelerator); + std::cout << " svm_reduce finished "<wait(); } +#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} diff --git a/benchmarks/Benchmark_halo.cc b/benchmarks/Benchmark_halo.cc new file mode 100644 index 00000000..43138e67 --- /dev/null +++ b/benchmarks/Benchmark_halo.cc @@ -0,0 +1,131 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt4= GridDefaultLatt(); + Coordinate mpi = GridDefaultMpi(); + Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd()); + + GridLogLayout(); + + int Ls=16; + for(int i=0;i> Ls; + } + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); + RealD N2 = 1.0/::sqrt(norm2(src)); + src = src*N2; + + std::cout << GridLogMessage << "Drawing gauge field" << std::endl; + LatticeGaugeFieldF Umu(UGrid); + SU::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; + + RealD mass=0.1; + RealD M5 =1.8; + + RealD NP = UGrid->_Nprocessors; + RealD NN = UGrid->NodeCount(); + + DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + const int ncall = 500; + std::cout << GridLogMessage<< "*********************************************************" <Barrier(); + Dw.Stencil.HaloExchangeOptGather(src,compressor); + double t0=usecond(); + for(int i=0;iBarrier(); + + double bytes=0.0; + if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3]; + if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3]; + if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3]; + if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2]; + bytes = bytes * Ls * 8.* (24.+12.)* 2.0; + + std::cout< 1tile.log +for i in 0 +do +mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768 +mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768 +done +#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log +#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh index a352fff9..bb7b517d 100755 --- a/systems/PVC/benchmarks/wrap.sh +++ b/systems/PVC/benchmarks/wrap.sh @@ -7,8 +7,8 @@ echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK if [ $MPI_LOCALRANKID = "0" ] then -# ~psteinbr/build_pti/ze_tracer -c $@ - onetrace --chrome-kernel-timeline $@ +# ~psteinbr/build_pti/ze_tracer -h $@ + onetrace --chrome-device-timeline $@ else $@ fi diff --git a/systems/PVC/config-command b/systems/PVC/config-command index 3f5b5993..cd7bba1d 100644 --- a/systems/PVC/config-command +++ b/systems/PVC/config-command @@ -2,14 +2,15 @@ INSTALL=/nfs/site/home/azusayax/install ../../configure \ --enable-simd=GPU \ --enable-gen-simd-width=64 \ - --enable-comms=mpi \ + --enable-comms=mpi-auto \ --disable-accelerator-cshift \ --disable-gparity \ --disable-fermion-reps \ --enable-shm=nvlink \ --enable-accelerator=sycl \ - --enable-unified=yes \ - CXX=mpicxx \ + --enable-unified=no \ + MPICXX=mpicxx \ + CXX=dpcpp \ LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare" + CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"