From 0b905a72ddfafcf02bcc4b6738ff31c74be79ed5 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 29 Oct 2021 02:22:22 +0100
Subject: [PATCH 01/10] Better reduction for GPUs

---
 Grid/lattice/Lattice_reduction_gpu.h | 48 ++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c2875052..823e497e 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -23,7 +23,7 @@ unsigned int nextPow2(Iterator x) {
 }
 
 template <class Iterator>
-void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
+int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
   
   int device;
 #ifdef GRID_CUDA
@@ -37,13 +37,13 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
   Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
   Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
-  
+  /*
   std::cout << GridLogDebug << "GPU has:" << std::endl;
   std::cout << GridLogDebug << "\twarpSize            = " << warpSize << std::endl;
   std::cout << GridLogDebug << "\tsharedMemPerBlock   = " << sharedMemPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmaxThreadsPerBlock  = " << maxThreadsPerBlock << std::endl;
   std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
-  
+  */
   if (warpSize != WARP_SIZE) {
     std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
     exit(EXIT_FAILURE);
@@ -53,12 +53,12 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
   
   threads = warpSize;
   if ( threads*sizeofsobj > sharedMemPerBlock ) {
     std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
-    exit(EXIT_FAILURE);
+    return 0;
   }
   while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
   // keep all the streaming multiprocessors busy
   blocks = nextPow2(multiProcessorCount);
-  
+  return 1;
 }
 
 template <class vobj, class sobj, class Iterator>
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer osites)
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
@@ -208,6 +208,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 
   Integer numThreads, numBlocks;
   getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  
   Integer smemSize = numThreads * sizeof(sobj);
 
   Vector<sobj> buffer(numBlocks);
@@ -218,6 +219,41 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   auto result = buffer_v[0];
   return result;
 }
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type vector;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobj;
+  sobj ret;
+  scalarD *ret_p = (scalarD *)&ret;
+
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+  Integer numThreads, numBlocks;
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+
+  if ( ok ) {
+    ret = sumD_gpu_internal(lat,osites);
+  } else {
+    std::cout << GridLogWarning << " dropping to summing word by word for large object size "<<sizeof(vobj)<<std::endl;
+    Vector<vector> buffer(osites);
+    vector *dat = (vector *)lat;
+    vector *buf = &buffer[0];
+    iScalar<vector> *tbuf = (iScalar<vector> *) &buffer[0];
+    for(int w=0;w<words;w++) {
+
+      accelerator_for(ss,osites,1,{
+	  buf[ss] = dat[ss*words+w];
+	});
+
+      ret_p[w] = sumD_gpu_internal(tbuf,osites);
+    }
+  }
+  return ret;
+}
 
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
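The fallback branch added above is the substance of this patch: getNumBlocksAndThreads now reports failure (return 0) rather than aborting when a single scalar_objectD cannot fit the shared-memory tree reduction, and sumD_gpu then sums the field one SIMD word at a time, so each pass reduces only an iScalar<vector> instead of the whole object. A minimal standalone sketch of that word-slicing idea, in plain C++ with invented types and sizes rather than Grid's, is:

    #include <cstddef>
    #include <vector>

    // Word-by-word reduction sketch: view an array of n large objects as an
    // n x words matrix of machine words and reduce one column at a time, so
    // the per-pass working object is a single word, not the whole struct.
    struct Big { double w[64]; };                  // stand-in for a large sobj

    Big sum_word_by_word(const Big *lat, std::size_t n) {
      const std::size_t words = sizeof(Big)/sizeof(double);
      const double *dat = reinterpret_cast<const double*>(lat);
      std::vector<double> buf(n);
      Big ret{};
      for (std::size_t w = 0; w < words; w++) {
        for (std::size_t ss = 0; ss < n; ss++)     // gather word w of each site
          buf[ss] = dat[ss*words + w];
        double acc = 0;                            // small reduction fits anywhere
        for (std::size_t ss = 0; ss < n; ss++) acc += buf[ss];
        ret.w[w] = acc;
      }
      return ret;
    }

The transpose-style gather costs one extra pass over the data, but it keeps the fast tree reduction usable for arbitrarily large site objects.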
From: Peter Boyle
Date: Fri, 29 Oct 2021 02:23:08 +0100
Subject: [PATCH 02/10] Verbosity

---
 Grid/lattice/Lattice_reduction_gpu.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 823e497e..73a704f5 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -238,7 +238,6 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   if ( ok ) {
     ret = sumD_gpu_internal(lat,osites);
   } else {
-    std::cout << GridLogWarning << " dropping to summing word by word for large object size "<<sizeof(vobj)<<std::endl;
     Vector<vector> buffer(osites);
     vector *dat = (vector *)lat;
     vector *buf = &buffer[0];

From e8c187b3233aa6a5a02e4a1a31a3e83d629b5f2b Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 15 Feb 2022 11:24:38 -0500
Subject: [PATCH 03/10] SyCL happier?

---
 Grid/qcd/action/fermion/WilsonCloverHelpers.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Grid/qcd/action/fermion/WilsonCloverHelpers.h b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
index 588525cc..60f19317 100644
--- a/Grid/qcd/action/fermion/WilsonCloverHelpers.h
+++ b/Grid/qcd/action/fermion/WilsonCloverHelpers.h
@@ -726,8 +726,8 @@ public:
   static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
     conformable(f, m);
     auto grid = f.Grid();
-    const int Nsite = grid->oSites();
-    const int Nsimd = grid->Nsimd();
+    const uint32_t Nsite = grid->oSites();
+    const uint32_t Nsimd = grid->Nsimd();
     autoView(f_v, f, AcceleratorWrite);
     autoView(m_v, m, AcceleratorRead);
     // NOTE: this function cannot be 'private' since nvcc forbids this for kernels
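The int to uint32_t change above is a small portability fix, presumably why the commit asks whether it makes SyCL happier: SYCL and similar device runtimes index work-items with unsigned types, so a signed site count inside an accelerator kernel invites signed/unsigned comparison diagnostics that strict device compilers may treat as errors. A contrived illustration of the pattern, in ordinary C++ rather than Grid code:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // With an unsigned bound the comparison below is clean; with
    // 'const int Nsite' it would mix std::size_t and int and draw
    // -Wsign-compare, which device toolchains often promote to an error.
    void scale_site(float *f, std::size_t id /* e.g. a device global id */,
                    uint32_t Nsite) {
      if (id < Nsite) f[id] *= 2.0f;
    }

    int main() {
      float f[4] = {1, 2, 3, 4};
      for (std::size_t id = 0; id < 4; id++) scale_site(f, id, 4);
      printf("%g %g %g %g\n", f[0], f[1], f[2], f[3]);
      return 0;
    }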
From 63dbaeefaa533717379811f4695a149a82e2d6ec Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 16 Feb 2022 14:01:43 +0000
Subject: [PATCH 04/10] Extra barrier prior to finalize just in case it fixes
 an issue on Tursa

---
 Grid/util/Init.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc
index 6992129e..36854d9c 100644
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@@ -534,6 +534,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
 #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
+  MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif

From 3e882f555dcb75a29e640c6c70fd7e7f23973b73 Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 1 Mar 2022 08:54:45 -0500
Subject: [PATCH 05/10] Large / small sumD options

---
 Grid/lattice/Lattice_reduction.h     |  9 +++++
 Grid/lattice/Lattice_reduction_gpu.h | 53 ++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index 326b9ea3..c3478ab4 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -142,6 +142,15 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
   return sumD_cpu(arg,osites);
 #endif
 }
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
+{
+#if defined(GRID_CUDA)||defined(GRID_HIP)
+  return sumD_gpu_large(arg,osites);
+#else
+  return sumD_cpu(arg,osites);
+#endif
+}
 
 template <class vobj>
 inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index 73a704f5..c685a2c0 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -198,7 +198,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
-inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer osites)
+inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
 {
   typedef typename vobj::scalar_objectD sobj;
   typedef decltype(lat) Iterator;
@@ -207,7 +207,8 @@ inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer
   Integer size = osites*nsimd;
 
   Integer numThreads, numBlocks;
-  getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
+  assert(ok);
 
   Integer smemSize = numThreads * sizeof(sobj);
 
@@ -219,6 +220,37 @@ inline typename vobj::scalar_objectD sumD_gpu_internal(const vobj *lat, Integer
   auto result = buffer_v[0];
   return result;
 }
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type vector;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobj;
+  sobj ret;
+  scalarD *ret_p = (scalarD *)&ret;
+
+  const int words = sizeof(vobj)/sizeof(vector);
+
+  Integer nsimd= vobj::Nsimd();
+  Integer size = osites*nsimd;
+  Integer numThreads, numBlocks;
+
+  Vector<vector> buffer(osites);
+  vector *dat = (vector *)lat;
+  vector *buf = &buffer[0];
+  iScalar<vector> *tbuf = (iScalar<vector> *) &buffer[0];
+  for(int w=0;w<words;w++) {
+    accelerator_for(ss,osites,1,{
+	buf[ss] = dat[ss*words+w];
+      });
+    ret_p[w] = sumD_gpu_small(tbuf,osites);
+  }
+  return ret;
+}
+
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
@@ -236,23 +268,14 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
 
   if ( ok ) {
-    ret = sumD_gpu_internal(lat,osites);
+    ret = sumD_gpu_small(lat,osites);
   } else {
-    Vector<vector> buffer(osites);
-    vector *dat = (vector *)lat;
-    vector *buf = &buffer[0];
-    iScalar<vector> *tbuf = (iScalar<vector> *) &buffer[0];
-    for(int w=0;w<words;w++) {
-      accelerator_for(ss,osites,1,{
-	  buf[ss] = dat[ss*words+w];
-	});
-      ret_p[w] = sumD_gpu_internal(tbuf,osites);
-    }
+    ret = sumD_gpu_large(lat,osites);
   }
   return ret;
 }
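Patch 05 makes the policy of patch 01 explicit: sumD_gpu_small asserts that the object fits the shared-memory budget, sumD_gpu_large always takes the word-sliced path, and sumD_gpu picks between them using the return code of getNumBlocksAndThreads. The fitting test amounts to asking whether one warp's worth of scalar objects fits in a block's shared memory; a back-of-envelope sketch (the 48 kB figure is an assumption for illustration, the real code queries the device):

    #include <cstddef>
    #include <cstdio>

    constexpr std::size_t kSharedMemPerBlock = 48*1024; // assumed, device-dependent
    constexpr std::size_t kWarpSize          = 32;

    // Mirrors the check in getNumBlocksAndThreads(): reject the tree
    // reduction if even one warp's worth of objects overflows shared memory.
    bool fits_small_path(std::size_t sizeof_sobj) {
      return kWarpSize * sizeof_sobj <= kSharedMemPerBlock;
    }

    int main() {
      printf("96 B object -> %s\n", fits_small_path(96)   ? "small" : "large");
      printf("4 kB object -> %s\n", fits_small_path(4096) ? "small" : "large");
      return 0;
    }

Under these assumed numbers anything up to 1.5 kB per scalar object stays on the fast path, which comfortably covers the usual spin-colour site types.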
From: Peter Boyle
Date: Tue, 1 Mar 2022 10:53:44 -0500
Subject: [PATCH 06/10] Configure for mac arm

---
 systems/mac-arm/config-command-mpi | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 systems/mac-arm/config-command-mpi

diff --git a/systems/mac-arm/config-command-mpi b/systems/mac-arm/config-command-mpi
new file mode 100644
index 00000000..d1e75c39
--- /dev/null
+++ b/systems/mac-arm/config-command-mpi
@@ -0,0 +1 @@
+CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi

From e16fc5b2e4dd2d507c8c92a05f987a8aa1ec5c3e Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 1 Mar 2022 11:17:24 -0500
Subject: [PATCH 07/10] Threaded intranode comms transfer - ideally between
 NUMA domains

---
 Grid/threads/Accelerator.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index b427b304..12483185 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -481,9 +481,10 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 #define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { memcpy(to,from,bytes);}
+
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { GridThread::bcopy(from,to,bytes);}
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ GridThread::bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { GridThread::bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
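The memcpy bodies replaced above are the stubs used by non-GPU builds, where "host to device" is just a host-side copy. Routing them through GridThread::bcopy lets several cores drive one transfer: a single core usually cannot saturate the memory system of a multi-socket node, while a team of threads spread over both NUMA domains can. A toy benchmark of the idea (illustrative only; results depend entirely on the machine, the OpenMP runtime, and thread pinning):

    #include <chrono>
    #include <cstdio>
    #include <cstring>
    #include <vector>
    #include <omp.h>

    int main() {
      const std::size_t bytes = std::size_t(1) << 28;   // 256 MiB
      std::vector<char> src(bytes, 1), dst(bytes);

      auto t0 = std::chrono::steady_clock::now();
      memcpy(dst.data(), src.data(), bytes);            // one core does it all
      auto t1 = std::chrono::steady_clock::now();

    #pragma omp parallel
      {
        int nt = omp_get_num_threads(), t = omp_get_thread_num();
        std::size_t chunk = bytes / nt;
        std::size_t off   = std::size_t(t) * chunk;
        std::size_t len   = (t == nt-1) ? bytes - off : chunk;
        memcpy(dst.data()+off, src.data()+off, len);    // each thread copies a slice
      }
      auto t2 = std::chrono::steady_clock::now();

      auto ms = [](auto a, auto b) {
        return std::chrono::duration<double,std::milli>(b-a).count();
      };
      printf("memcpy   : %.1f ms\nthreaded : %.1f ms\n", ms(t0,t1), ms(t1,t2));
      return 0;
    }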
From d4ae71b8806e535145a718a9e1e00a7315aaf5bc Mon Sep 17 00:00:00 2001
From: Fabian Joswig
Date: Wed, 2 Mar 2022 15:40:18 +0000
Subject: [PATCH 08/10] sum_gpu_large and sum_gpu templates added.

---
 Grid/lattice/Lattice_reduction.h     | 16 ++++++++++++++++
 Grid/lattice/Lattice_reduction_gpu.h |  8 ++++++++
 2 files changed, 24 insertions(+)

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
index c3478ab4..0ddac437 100644
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ -168,6 +168,22 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   return ssum;
 }
 
+template <class vobj>
+inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+{
+#if defined(GRID_CUDA)||defined(GRID_HIP)
+  autoView( arg_v, arg, AcceleratorRead);
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum_gpu_large(&arg_v[0],osites);
+#else
+  autoView(arg_v, arg, CpuRead);
+  Integer osites = arg.Grid()->oSites();
+  auto ssum= sum_cpu(&arg_v[0],osites);
+#endif
+  arg.Grid()->GlobalSum(ssum);
+  return ssum;
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Deterministic Reduction operations
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c685a2c0..c3422af3 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -288,6 +288,14 @@ inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
   return result;
 }
 
+template <class vobj>
+inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::scalar_object sobj;
+  sobj result;
+  result = sumD_gpu_large(lat,osites);
+  return result;
+}
 
 NAMESPACE_END(Grid);

From d1decee4cce0b9a9f02bb5521cb06840b20022ad Mon Sep 17 00:00:00 2001
From: Fabian Joswig
Date: Wed, 2 Mar 2022 16:54:23 +0000
Subject: [PATCH 09/10] Cleaned up unused variables in Lattice_reduction_gpu.h

---
 Grid/lattice/Lattice_reduction_gpu.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index c3422af3..bad86d2a 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -232,10 +232,6 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
 
   const int words = sizeof(vobj)/sizeof(vector);
 
-  Integer nsimd= vobj::Nsimd();
-  Integer size = osites*nsimd;
-  Integer numThreads, numBlocks;
-
   Vector<vector> buffer(osites);
   vector *dat = (vector *)lat;
   vector *buf = &buffer[0];
@@ -258,10 +254,7 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
   typedef typename vobj::scalar_typeD scalarD;
   typedef typename vobj::scalar_objectD sobj;
   sobj ret;
-  scalarD *ret_p = (scalarD *)&ret;
-  const int words = sizeof(vobj)/sizeof(vector);
-
   Integer nsimd= vobj::Nsimd();
   Integer size = osites*nsimd;
   Integer numThreads, numBlocks;
@@ -275,7 +268,6 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 
   return ret;
 }
-
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Return as same precision as input performing reduction in double precision though
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -297,5 +289,4 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
   return result;
 }
 
-
 NAMESPACE_END(Grid);
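Patches 08 and 09 finish the interface: sum_large is to sum what sumD_large is to sumD, the same contract (site-local reduction followed by a GlobalSum across ranks) but safe for site objects beyond the shared-memory limit. A usage sketch, assuming Grid's usual typedefs; the 32x32 matrix is an invented example of an oversized site object and the grid construction boilerplate is omitted:

    #include <Grid/Grid.h>
    using namespace Grid;

    // One 32x32 complex matrix per site is far past the one-warp
    // shared-memory budget that plain sum() relies on.
    typedef iScalar<iMatrix<vComplexD, 32> > BigSite;

    BigSite::scalar_object total_of(GridCartesian *grid) {
      Lattice<BigSite> field(grid);
      field = Zero();
      return sum_large(field);  // word-sliced device reduction + GlobalSum
    }

For ordinary spin-colour fields the existing sum() remains the right call; sum_large simply trades some bandwidth for correctness on big per-site payloads.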
From 92a83a9eb33c54467b29ff557bad4aed90c4f65e Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Wed, 16 Mar 2022 17:14:36 +0000
Subject: [PATCH 10/10] Performance improve for Tesseract

---
 Grid/threads/Accelerator.h |  6 +++---
 Grid/threads/Threads.h     | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h
index 12483185..389f2cc4 100644
--- a/Grid/threads/Accelerator.h
+++ b/Grid/threads/Accelerator.h
@@ -482,9 +482,9 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
 
 accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
 
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { GridThread::bcopy(from,to,bytes);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ GridThread::bcopy(from,to,bytes);}
-inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { GridThread::bcopy(from,to,bytes);}
+inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
+inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
+inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
 inline void acceleratorCopySynchronise(void) {};
 inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
 
diff --git a/Grid/threads/Threads.h b/Grid/threads/Threads.h
index a9fa13ea..6887134d 100644
--- a/Grid/threads/Threads.h
+++ b/Grid/threads/Threads.h
@@ -72,3 +72,20 @@ Author: paboyle
 #define thread_region    DO_PRAGMA(omp parallel)
 #define thread_critical  DO_PRAGMA(omp critical)
 
+#ifdef GRID_OMP
+inline void thread_bcopy(void *from, void *to,size_t bytes)
+{
+  uint64_t *ufrom = (uint64_t *)from;
+  uint64_t *uto   = (uint64_t *)to;
+  assert(bytes%8==0);
+  uint64_t words=bytes/8;
+  thread_for(w,words,{
+      uto[w] = ufrom[w];
+    });
+}
+#else
+inline void thread_bcopy(void *from, void *to,size_t bytes)
+{
+  bcopy(from,to,bytes);
+}
+#endif
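thread_bcopy lifts the same trick into a reusable helper: the transfer is expressed as a thread_for over 64-bit words, so under OpenMP every thread in the team stores a share of the data, with a plain bcopy fallback in non-threaded builds. Note the hard assumption, enforced by the assert, that the length is a multiple of 8 bytes. For readers without the Grid headers, a standalone equivalent with the macro expanded to raw OpenMP (names invented):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // thread_for(w,words,{...}) is essentially an '#pragma omp parallel for'
    // over w = 0..words-1; this is the same copy without the Grid macros.
    inline void parallel_word_copy(const void *from, void *to, std::size_t bytes) {
      assert(bytes % 8 == 0);                  // word-granular transfers only
      const uint64_t *src = (const uint64_t *)from;
      uint64_t *dst = (uint64_t *)to;
      uint64_t words = bytes / 8;
    #pragma omp parallel for
      for (uint64_t w = 0; w < words; w++) dst[w] = src[w];
    }

    int main() {
      std::vector<uint64_t> a(1 << 20, 42), b(1 << 20);
      parallel_word_copy(a.data(), b.data(), a.size()*sizeof(uint64_t));
      printf("b[12345] = %llu\n", (unsigned long long)b[12345]);
      return 0;
    }

Built without OpenMP the pragma is ignored and the loop degenerates to a serial copy, mirroring the bcopy fallback in the patch.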