diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 0ddac437..fb6a258c 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -28,6 +28,9 @@ Author: Christoph Lehner
 #if defined(GRID_CUDA)||defined(GRID_HIP)
 #include <Grid/lattice/Lattice_reduction_gpu.h>
 #endif
+#if defined(GRID_SYCL)
+#include <Grid/lattice/Lattice_reduction_sycl.h>
+#endif
 
 NAMESPACE_BEGIN(Grid);
 
@@ -127,7 +130,7 @@ inline Double max(const Double *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   return sum_gpu(arg,osites);
 #else
   return sum_cpu(arg,osites);
@@ -136,7 +139,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   return sumD_gpu(arg,osites);
 #else
   return sumD_cpu(arg,osites);
@@ -145,7 +148,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   return sumD_gpu_large(arg,osites);
 #else
   return sumD_cpu(arg,osites);
@@ -155,13 +158,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 template<class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
-  autoView( arg_v, arg, AcceleratorRead);
   Integer osites = arg.Grid()->oSites();
-  auto ssum= sum_gpu(&arg_v[0],osites);
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
+  typename vobj::scalar_object ssum;
+  autoView( arg_v, arg, AcceleratorRead);
+  ssum= sum_gpu(&arg_v[0],osites);
 #else
   autoView(arg_v, arg, CpuRead);
-  Integer osites = arg.Grid()->oSites();
   auto ssum= sum_cpu(&arg_v[0],osites);
 #endif
   arg.Grid()->GlobalSum(ssum);
@@ -171,7 +174,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
 template<class vobj>
 inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
 {
-#if defined(GRID_CUDA)||defined(GRID_HIP)
+#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
   autoView( arg_v, arg, AcceleratorRead);
   Integer osites = arg.Grid()->oSites();
   auto ssum= sum_gpu_large(&arg_v[0],osites);
@@ -235,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
   typedef decltype(innerProductD(vobj(),vobj())) inner_t;
   Vector<inner_t> inner_tmp(sites);
   auto inner_tmp_v = &inner_tmp[0];
-  {
     autoView( left_v , left, AcceleratorRead);
     autoView( right_v,right, AcceleratorRead);
-
+    // This code could read coalesce
     // GPU - SIMT lane compliance...
     accelerator_for( ss, sites, 1,{
       auto x_l = left_v[ss];
       auto y_l = right_v[ss];
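[Reviewer note, not part of the patch] The hunks above all follow one pattern: widen the compile-time guard so that GRID_SYCL, like GRID_CUDA and GRID_HIP, selects the accelerator reduction path, with the CPU reduction as the fallback. A minimal sketch of that dispatch, using hypothetical toy_sum_* stand-ins rather than Grid's sum_gpu/sum_cpu:

// Sketch only: toy_sum_gpu is a hypothetical stand-in for the device path.
// On a host-only build none of the guard macros are defined, so the CPU
// branch is selected and this program is self-contained and runnable.
#include <cstddef>
#include <cstdio>
#include <vector>

template<class T> T toy_sum_cpu(const T *arg, std::size_t n) {
  T s{};                                   // serial host reduction
  for (std::size_t i = 0; i < n; i++) s += arg[i];
  return s;
}

template<class T> T toy_sum(const T *arg, std::size_t n) {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
  return toy_sum_gpu(arg, n);              // device reduction, as in the patch
#else
  return toy_sum_cpu(arg, n);              // host fallback
#endif
}

int main() {
  std::vector<double> v(1000, 0.5);
  std::printf("sum = %f\n", toy_sum(v.data(), v.size())); // prints 500.0
  return 0;
}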
diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h
new file mode 100644
index 00000000..90980c4c
--- /dev/null
+++ b/Grid/lattice/Lattice_reduction_sycl.h
@@ -0,0 +1,125 @@
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Possibly promote to double and sum
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::scalar_objectD sobjD;
+  sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
+  sobj identity; zeroit(identity);
+  sobj ret ;
+
+  Integer nsimd= vobj::Nsimd();
+
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
+      cgh.parallel_for(cl::sycl::range<1>{osites},
+                       Reduction,
+                       [=] (cl::sycl::id<1> item, auto &sum) {
+                         auto osite = item[0];
+                         sum +=Reduce(lat[osite]);
+                       });
+    });
+  theGridAccelerator->wait();
+  ret = mysum[0];
+  free(mysum,*theGridAccelerator);
+  sobjD dret; convertType(dret,ret);
+  return dret;
+}
+
+template<class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
+{
+  return sumD_gpu_tensor(lat,osites);
+}
+template<class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
+{
+  return sumD_gpu_large(lat,osites);
+}
+
+template<class vobj>
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+{
+  return sumD_gpu_large(lat,osites);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Return as same precision as input performing reduction in double precision though
+/////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<class vobj>
+inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu(lat,osites);
+  return result;
+}
+
+template<class vobj>
+inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu_large(lat,osites);
+  return result;
+}
+
+NAMESPACE_END(Grid);
+
+/*
+template<class Double> Double svm_reduce(Double *vec,uint64_t L)
+{
+  Double sumResult; zeroit(sumResult);
+  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
+  Double identity; zeroit(identity);
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+      cgh.parallel_for(cl::sycl::range<1>{L},
+                       Reduction,
+                       [=] (cl::sycl::id<1> index, auto &sum) {
+                         sum +=vec[index];
+                       });
+    });
+  theGridAccelerator->wait();
+  Double ret = d_sum[0];
+  free(d_sum,*theGridAccelerator);
+  std::cout << " svm_reduce finished "<<ret<<std::endl;
+  return ret;
+}
+*/
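[Reviewer note, not part of the patch] sumD_gpu_tensor above is a SYCL 2020 reduction over outer sites: a USM shared scalar accumulates through cl::sycl::reduction, the host waits on the queue, and the result is then promoted to double via convertType. The same pattern as a free-standing program, written with the modern sycl:: namespace rather than the cl::sycl:: spelling of the patch (the default-constructed queue and the explicit 0.0 identity are choices of this sketch, not Grid API):

#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
  sycl::queue q;                               // default device
  const size_t N = 1 << 20;

  // USM shared allocations, analogous to malloc_shared(...,*theGridAccelerator)
  double *data   = sycl::malloc_shared<double>(N, q);
  double *result = sycl::malloc_shared<double>(1, q);
  for (size_t i = 0; i < N; i++) data[i] = 1.0;
  // Without the initialize_to_identity property, the variable's prior value
  // is combined into the reduction, so zero it explicitly first.
  *result = 0.0;

  q.submit([&](sycl::handler &cgh) {
    auto red = sycl::reduction(result, 0.0, std::plus<double>());
    cgh.parallel_for(sycl::range<1>{N}, red,
                     [=](sycl::id<1> i, auto &sum) { sum += data[i]; });
  });
  q.wait();                                    // mirrors theGridAccelerator->wait()

  std::printf("sum = %f (expect %zu)\n", *result, N);
  sycl::free(data, q);
  sycl::free(result, q);
  return 0;
}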
diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh
--- a/systems/PVC/benchmarks/run-2tile-mpi.sh
+++ b/systems/PVC/benchmarks/run-2tile-mpi.sh
@@ -7,5 +7,7 @@
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.1x2.log
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.2x1.log
-
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
+for i in 0
+do
+mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
+mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
+done
+#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
+#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
diff --git a/systems/PVC/config-command b/systems/PVC/config-command
index 3f5b5993..cd7bba1d 100644
--- a/systems/PVC/config-command
+++ b/systems/PVC/config-command
@@ -2,14 +2,15 @@ INSTALL=/nfs/site/home/azusayax/install
 ../../configure \
     --enable-simd=GPU \
     --enable-gen-simd-width=64 \
-    --enable-comms=mpi \
+    --enable-comms=mpi-auto \
     --disable-accelerator-cshift \
     --disable-gparity \
     --disable-fermion-reps \
     --enable-shm=nvlink \
     --enable-accelerator=sycl \
-    --enable-unified=yes \
-    CXX=mpicxx \
+    --enable-unified=no \
+    MPICXX=mpicxx \
+    CXX=dpcpp \
     LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
-    CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
+    CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"
 
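[Reviewer note, not part of the patch] The config change pairs --enable-unified=no with an explicit device-memory budget (the benchmark lines now pass --device-mem 32768), which appears to move Grid's allocations from shared USM to device-resident USM with explicit staging. Roughly, in plain SYCL terms (the queue and sizes below are assumptions of this sketch, not Grid's allocator):

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  const size_t N = 1024;

  // --enable-unified=yes : one allocation visible to host and device
  double *shared = sycl::malloc_shared<double>(N, q);
  for (size_t i = 0; i < N; i++) shared[i] = 1.0;  // host writes directly

  // --enable-unified=no : device-resident memory, explicit staging copies
  double *host = new double[N]();
  double *dev  = sycl::malloc_device<double>(N, q);
  q.memcpy(dev, host, N * sizeof(double)).wait();  // host -> device
  q.memcpy(host, dev, N * sizeof(double)).wait();  // device -> host

  sycl::free(shared, q);
  sycl::free(dev, q);
  delete[] host;
  return 0;
}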