From 6153dec2e40dc711ce2eccd49b1f61e8704d1d69 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Mar 2024 13:38:32 -0500 Subject: [PATCH 01/17] Update setup.sh --- systems/PVC-OEM/setup.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/systems/PVC-OEM/setup.sh b/systems/PVC-OEM/setup.sh index 3b8188f0..0e780ef4 100644 --- a/systems/PVC-OEM/setup.sh +++ b/systems/PVC-OEM/setup.sh @@ -1,3 +1,5 @@ export https_proxy=http://proxy-chain.intel.com:911 module load intel-release module load intel/mpich +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" From 2ae980ae439b92285341ec5777c9c4b5ec293547 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Mar 2024 13:39:18 -0500 Subject: [PATCH 02/17] Update sourceme.sh --- systems/Aurora/sourceme.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 7a2b3815..60abed41 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -9,4 +9,5 @@ export http_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128 #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 git config --global http.proxy http://proxy.alcf.anl.gov:3128 - + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" From 30228214f737c002ca2c5636b73bc23af59a88ae Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Mar 2024 23:56:10 +0000 Subject: [PATCH 03/17] SYCL conflict with Eigen --- Grid/Grid_Eigen_Dense.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index bdd39a65..8bd1d113 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -34,7 +34,7 @@ #pragma push_macro("__SYCL_DEVICE_ONLY__") #undef __SYCL_DEVICE_ONLY__ #define EIGEN_DONT_VECTORIZE -//#undef EIGEN_USE_SYCL +#undef EIGEN_USE_SYCL #define __SYCL__REDEFINE__ #endif From 21bc8c24df800a480c3d36c5a135f9512658631f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Mar 2024 23:58:20 +0000 Subject: [PATCH 04/17] OneMKL batched blas starting --- Grid/algorithms/blas/BatchedBlas.h | 40 ++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 2924350d..5ab46333 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -34,9 +34,14 @@ Author: Peter Boyle #include #endif #ifdef GRID_SYCL -#error // need oneMKL version +#include +#endif +#if 0 +#define GRID_ONE_MKL +#endif +#ifdef GRID_ONE_MKL +#include #endif - /////////////////////////////////////////////////////////////////////// // Need to rearrange lattice data to be in the right format for a // batched multiply. Might as well make these static, dense packed @@ -49,9 +54,12 @@ NAMESPACE_BEGIN(Grid); typedef cudablasHandle_t gridblasHandle_t; #endif #ifdef GRID_SYCL - typedef int32_t gridblasHandle_t; + typedef cl::sycl::queue *gridblasHandle_t; #endif -#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) +#ifdef GRID_ONE_MKL + typedef cl::sycl::queue *gridblasHandle_t; +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) typedef int32_t gridblasHandle_t; #endif @@ -76,6 +84,12 @@ public: hipblasCreate(&gridblasHandle); #endif #ifdef GRID_SYCL + gridblasHandle = theGridAccelerator; +#endif +#ifdef GRID_ONE_MKL + cl::sycl::cpu_selector selector; + cl::sycl::device selectedDevice { selector }; + gridblasHandle =new sycl::queue (selectedDevice); #endif gridblasInit=1; } @@ -110,6 +124,9 @@ public: #endif #ifdef GRID_SYCL accelerator_barrier(); +#endif +#ifdef GRID_ONE_MKL + gridblasHandle->wait(); #endif } @@ -644,10 +661,19 @@ public: (cuDoubleComplex *) Cmn, ldc, sdc, batchCount); #endif -#ifdef GRID_SYCL - #warning "oneMKL implementation not made " +#if defined(GRID_SYCL) || defined(GRID_ONE_MKL) + oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, + oneapi::mkl::transpose::N, + oneapi::mkl::transpose::N, + m,n,k, + alpha, + (const ComplexD *)Amk,lda,sda, + (const ComplexD *)Bkn,ldb,sdb, + beta, + (ComplexD *)Cmn,ldc,sdc, + batchCount); #endif -#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { From f8ca971daedc087fbb1f554705d27d6af44eef6f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Mar 2024 23:59:13 +0000 Subject: [PATCH 05/17] Use of a bare PRECISION macro is not namespace safe and collides with SYCL --- Grid/algorithms/approx/Zolotarev.cc | 96 ++++++++++++++--------------- Grid/algorithms/approx/Zolotarev.h | 11 ++-- 2 files changed, 54 insertions(+), 53 deletions(-) diff --git a/Grid/algorithms/approx/Zolotarev.cc b/Grid/algorithms/approx/Zolotarev.cc index c2efd41c..47779eae 100644 --- a/Grid/algorithms/approx/Zolotarev.cc +++ b/Grid/algorithms/approx/Zolotarev.cc @@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k, * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and * type = 1 for the approximation which is infinite at x = 0. */ -zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { +zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) { INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F, l, invlambda, xi, xisq, *tv, s, opl; int m, czero, ts; @@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { construct_partfrac(d); construct_contfrac(d); - /* Converting everything to PRECISION for external use only */ + /* Converting everything to ZOLO_PRECISION for external use only */ zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); - zd -> A = (PRECISION) d -> A; - zd -> Delta = (PRECISION) d -> Delta; - zd -> epsilon = (PRECISION) d -> epsilon; + zd -> A = (ZOLO_PRECISION) d -> A; + zd -> Delta = (ZOLO_PRECISION) d -> Delta; + zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; zd -> n = d -> n; zd -> type = d -> type; zd -> dn = d -> dn; @@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { zd -> deg_num = d -> deg_num; zd -> deg_denom = d -> deg_denom; - zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); - for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; + zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; free(d -> a); - zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); - for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; + zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; free(d -> ap); - zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); - for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; + zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; free(d -> alpha); - zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); - for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; + zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; free(d -> beta); - zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); - for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; + zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; free(d -> gamma); free(d); @@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata) } -zolotarev_data* higham(PRECISION epsilon, int n) { +zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) { INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq; int m, czero; zolotarev_data *zd; @@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) { /* Converting everything to PRECISION for external use only */ zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); - zd -> A = (PRECISION) d -> A; - zd -> Delta = (PRECISION) d -> Delta; - zd -> epsilon = (PRECISION) d -> epsilon; + zd -> A = (ZOLO_PRECISION) d -> A; + zd -> Delta = (ZOLO_PRECISION) d -> Delta; + zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; zd -> n = d -> n; zd -> type = d -> type; zd -> dn = d -> dn; @@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) { zd -> deg_num = d -> deg_num; zd -> deg_denom = d -> deg_denom; - zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); - for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; + zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; free(d -> a); - zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); - for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; + zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; free(d -> ap); - zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); - for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; + zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; free(d -> alpha); - zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); - for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; + zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; free(d -> beta); - zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); - for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; + zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; free(d -> gamma); free(d); @@ -523,17 +523,17 @@ NAMESPACE_END(Grid); #ifdef TEST #undef ZERO -#define ZERO ((PRECISION) 0) +#define ZERO ((ZOLO_PRECISION) 0) #undef ONE -#define ONE ((PRECISION) 1) +#define ONE ((ZOLO_PRECISION) 1) #undef TWO -#define TWO ((PRECISION) 2) +#define TWO ((ZOLO_PRECISION) 2) /* Evaluate the rational approximation R(x) using the factored form */ -static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R; + ZOLO_PRECISION R; if (rdata -> type == 0) { R = rdata -> A * x; @@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { /* Evaluate the rational approximation R(x) using the partial fraction form */ -static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R = rdata -> alpha[rdata -> da - 1]; + ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1]; for (m = 0; m < rdata -> dd; m++) R += rdata -> alpha[m] / (x * x - rdata -> ap[m]); if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x); @@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0, * but with signalling overflow you will get an error message. */ -static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R = rdata -> beta[0] * x; + ZOLO_PRECISION R = rdata -> beta[0] * x; for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R; return R; } /* Evaluate the rational approximation R(x) using Cayley form */ -static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION T; + ZOLO_PRECISION T; T = rdata -> type == 0 ? ONE : -ONE; for (m = 0; m < rdata -> n; m++) @@ -607,7 +607,7 @@ int main(int argc, char** argv) { int m, n, plotpts = 5000, type = 0; float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr; zolotarev_data *rdata; - PRECISION y; + ZOLO_PRECISION y; FILE *plot_function, *plot_error, *plot_partfrac, *plot_contfrac, *plot_cayley; @@ -626,13 +626,13 @@ int main(int argc, char** argv) { } rdata = type == 2 - ? higham((PRECISION) eps, n) - : zolotarev((PRECISION) eps, n, type); + ? higham((ZOLO_PRECISION) eps, n) + : zolotarev((ZOLO_PRECISION) eps, n, type); printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION) "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION) - "\tPRECISION = " STRINGIFY(PRECISION) + "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION) "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n" "\tDelta = %g (maximum error)\n\n" "\tA = %g (overall factor)\n", @@ -681,15 +681,15 @@ int main(int argc, char** argv) { x = 2.4 * (float) m / plotpts - 1.2; if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) { /* skip x = 0 for type 1, as R(0) is singular */ - y = zolotarev_eval((PRECISION) x, rdata); + y = zolotarev_eval((ZOLO_PRECISION) x, rdata); fprintf(plot_function, "%g %g\n", x, (float) y); fprintf(plot_error, "%g %g\n", x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta)); - ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y) + ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); - ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y) + ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); - ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y) + ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) { maxypferr = MAX(maxypferr, fabs(ypferr)); diff --git a/Grid/algorithms/approx/Zolotarev.h b/Grid/algorithms/approx/Zolotarev.h index 800cf3c7..3c983cd3 100644 --- a/Grid/algorithms/approx/Zolotarev.h +++ b/Grid/algorithms/approx/Zolotarev.h @@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx); #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> #ifndef ZOLOTAREV_INTERNAL -#ifndef PRECISION -#define PRECISION double +#ifndef ZOLO_PRECISION +#define ZOLO_PRECISION double #endif -#define ZPRECISION PRECISION +#define ZPRECISION ZOLO_PRECISION #define ZOLOTAREV_DATA zolotarev_data #endif @@ -77,8 +77,8 @@ typedef struct { * zolotarev_data structure. The arguments must satisfy the constraints that * epsilon > 0, n > 0, and type = 0 or 1. */ -ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ; -ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type); +ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ; +ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type); void zolotarev_free(zolotarev_data *zdata); #endif @@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata); NAMESPACE_END(Approx); NAMESPACE_END(Grid); #endif + From 976c3e9b59a10139611886b1cf97e40e3470cbfd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 5 Mar 2024 23:59:57 +0000 Subject: [PATCH 06/17] Hack for flight logging CG inner products. Can be made to work, but could put in some more serious infrastructure for repro testing and blame attribution (Britney test) if necessary --- Grid/lattice/Lattice.h | 2 +- Grid/lattice/Lattice_crc.h | 4 ++-- Grid/lattice/Lattice_reduction.h | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 6343db99..79572949 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -35,6 +35,7 @@ Author: Peter Boyle #include #include #include +#include #include #include #include @@ -46,5 +47,4 @@ Author: Peter Boyle #include #include #include -#include #include diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h index 142e2349..e31d8441 100644 --- a/Grid/lattice/Lattice_crc.h +++ b/Grid/lattice/Lattice_crc.h @@ -42,13 +42,13 @@ template void DumpSliceNorm(std::string s,Lattice &f,int mu=-1 } } -template uint32_t crc(Lattice & buf) +template uint32_t crc(const Lattice & buf) { autoView( buf_v , buf, CpuRead); return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); } -#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "< inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); ComplexD nrm = rankInnerProduct(left,right); + // std::cerr<<"flight log " << std::hexfloat << nrm <<" "<GlobalSum(nrm); return nrm; } From 783a66b3485169888000cdb735c34617126f47f3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 00:01:37 +0000 Subject: [PATCH 07/17] Deterministic reduction please --- Grid/threads/Accelerator.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index f6efdee9..392cba61 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -255,17 +255,13 @@ inline int acceleratorIsCommunicable(void *ptr) #define GRID_SYCL_LEVEL_ZERO_IPC NAMESPACE_END(Grid); -#if 0 -#include -#include -#include -#include -#else + +// Force deterministic reductions +#define SYCL_REDUCTION_DETERMINISTIC #include #include #include #include -#endif NAMESPACE_BEGIN(Grid); From 1b93a9be88cad5d456a84fafdda4104d4213ff82 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 00:01:58 +0000 Subject: [PATCH 08/17] Print out the hostname --- Grid/util/Init.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index d013763a..9a0b4376 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -393,6 +393,9 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogMessage << "MPI is initialised and logging filters activated "< Date: Wed, 6 Mar 2024 00:02:27 +0000 Subject: [PATCH 09/17] SPR HBM benchmarking right and also PVC batched GEMM --- benchmarks/Benchmark_usqcd.cc | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc index eaa78e40..3b729b9e 100644 --- a/benchmarks/Benchmark_usqcd.cc +++ b/benchmarks/Benchmark_usqcd.cc @@ -219,7 +219,7 @@ public: uint64_t NN; - uint64_t lmax=32; + uint64_t lmax=40; #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat) GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); @@ -454,11 +454,17 @@ public: pickCheckerboard(Even,src_e,src); pickCheckerboard(Odd,src_o,src); - const int num_cases = 1; +#ifdef AVX512 + const int num_cases = 3; +#else + const int num_cases = 2; +#endif std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); controls Cases [] = { - { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } + { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } }; for(int c=0;c({8,2,2,2}); -#else LebesgueOrder::Block = std::vector({2,2,2,2}); -#endif + Benchmark::Decomposition(); int do_su4=0; @@ -910,7 +919,7 @@ int main (int argc, char ** argv) } if ( do_blas ) { -#if defined(GRID_CUDA) || defined(GRID_HIP) +#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) std::cout< Date: Wed, 6 Mar 2024 00:03:16 +0000 Subject: [PATCH 10/17] Reproducing CG can be more useful now --- tests/Test_dwf_mixedcg_prec.cc | 111 +++++++++++++++++---------------- 1 file changed, 58 insertions(+), 53 deletions(-) diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index cbc573d1..13cc0bb6 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -30,27 +30,16 @@ Author: Peter Boyle using namespace std; using namespace Grid; -template -struct scal { - d internal; -}; - - Gamma::Algebra Gmu [] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT - }; - int main (int argc, char ** argv) { + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + std::string host(hostname); + Grid_init(&argc,&argv); const int Ls=12; - std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; - - { GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -92,7 +81,14 @@ int main (int argc, char ** argv) SchurDiagMooeeOperator HermOpEO(Ddwf); SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); - std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + int nsecs=600; + if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ + std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds"); + GridCmdOptionInt(arg,nsecs); + } + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "< mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); double t1,t2,flops; double MdagMsiteflops = 1452; // Mobius (real coeffs) @@ -101,7 +97,14 @@ int main (int argc, char ** argv) std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<gSites()*iters; std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< CG(1.0e-8,10000); - for(int i=0;i<1;i++){ + csumref=0; + int i=0; + do { + std::cerr << "******************* DOUBLE PRECISION SOLVE "<gSites()*iters; flops+= CGsiteflops*FrbGrid->gSites()*iters; - + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< munge; - std::string format = getFormatString(); - - BinaryIO::writeLatticeObject(result_o,file1,munge, 0, format, - nersc_csum,scidac_csuma,scidac_csumb); - - std::cout << GridLogMessage << " Mixed checksums "<(result_o_2,file1,munge, 0, format, - nersc_csum,scidac_csuma,scidac_csumb); - - std::cout << GridLogMessage << " CG checksums "< Date: Wed, 6 Mar 2024 00:03:59 +0000 Subject: [PATCH 11/17] More blasted shell variables --- systems/Aurora/benchmarks/bench1024.pbs | 10 +++++++--- systems/Aurora/benchmarks/bench12.pbs | 17 ++++++++++++++++- systems/Aurora/config-command | 4 ++-- systems/Aurora/sourceme.sh | 13 +++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs index 88f0100a..2e99ae4b 100644 --- a/systems/Aurora/benchmarks/bench1024.pbs +++ b/systems/Aurora/benchmarks/bench1024.pbs @@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 -export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 export MPICH_OFI_NIC_POLICY=GPU +export FI_CXI_CQ_FILL_PERCENT=10 +export FI_CXI_DEFAULT_CQ_SIZE=262144 +#export FI_CXI_DEFAULT_CQ_SIZE=131072 +#export FI_CXI_CQ_FILL_PERCENT=20 # 12 ppn, 32 nodes, 384 ranks # @@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -$CMD | tee 1024node.dwf.small +$CMD | tee 1024node.dwf.small.cq CMD="mpiexec -np 12288 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" -$CMD | tee 1024node.dwf +$CMD | tee 1024node.dwf.cq diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench12.pbs index 96f6143f..ee3cb381 100644 --- a/systems/Aurora/benchmarks/bench12.pbs +++ b/systems/Aurora/benchmarks/bench12.pbs @@ -17,6 +17,7 @@ source ../sourceme.sh export OMP_NUM_THREADS=3 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST @@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12 -envall \ ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" -$CMD +#$CMD CMD="mpiexec -np 24 -ppn 12 -envall \ ./gpu_tile_compact.sh \ ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \ --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" +#$CMD + +CMD="mpiexec -np 1 -ppn 1 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 1 -ppn 1 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + $CMD diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command index e59ef515..689747c9 100644 --- a/systems/Aurora/config-command +++ b/systems/Aurora/config-command @@ -11,6 +11,6 @@ TOOLS=$HOME/tools --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ - CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \ + CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel" diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh index 7a2b3815..effb2d5d 100644 --- a/systems/Aurora/sourceme.sh +++ b/systems/Aurora/sourceme.sh @@ -3,6 +3,19 @@ module use /soft/modulefiles module load intel_compute_runtime/release/agama-devel-682.22 +export FI_CXI_DEFAULT_CQ_SIZE=131072 +export FI_CXI_CQ_FILL_PERCENT=20 + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" +#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" + +# +# -ftarget-register-alloc-mode=pvc:default +# -ftarget-register-alloc-mode=pvc:small +# -ftarget-register-alloc-mode=pvc:large +# -ftarget-register-alloc-mode=pvc:auto +# + export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128 From a46a0f088276de56a68d5020e9eee6875647c688 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 01:12:49 +0000 Subject: [PATCH 12/17] force device copyable and don't take crap from SYCL --- Grid/simd/Grid_vector_types.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index daf41cae..0a3d176f 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc NAMESPACE_END(Grid); +#ifdef GRID_SYCL +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +#endif + + #endif From 10116b3be8730507876336b92490a82d39000f50 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 01:13:27 +0000 Subject: [PATCH 13/17] Force device copyable and tell SYCL to shut it. --- Grid/tensors/Tensor_traits.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 98bc3986..536e17f1 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid); }; NAMESPACE_END(Grid); + +#ifdef GRID_SYCL +template struct +sycl::is_device_copyable::value && (!std::is_trivially_copyable::value), + void>::type> + : public std::true_type {}; +#endif + From 891a366f73a5dfe9a6822c8d884c9b22c82de971 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 01:22:55 +0000 Subject: [PATCH 14/17] Repro CG script --- systems/Aurora/tests/repro16.pbs | 40 ++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 systems/Aurora/tests/repro16.pbs diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs new file mode 100644 index 00000000..28030a3d --- /dev/null +++ b/systems/Aurora/tests/repro16.pbs @@ -0,0 +1,40 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=16 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 16 nodes, 192 ranks +CMD="mpiexec -np 192 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" +$CMD From b812a7b4c67e712ec1524fa1958c825057d1e27a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 01:32:40 +0000 Subject: [PATCH 15/17] Staggered launch script --- systems/Aurora/tests/solver/stag16.pbs | 40 ++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 systems/Aurora/tests/solver/stag16.pbs diff --git a/systems/Aurora/tests/solver/stag16.pbs b/systems/Aurora/tests/solver/stag16.pbs new file mode 100644 index 00000000..5bfe04a6 --- /dev/null +++ b/systems/Aurora/tests/solver/stag16.pbs @@ -0,0 +1,40 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=16 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 16 nodes, 192 ranks +CMD="mpiexec -np 192 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" +$CMD From 228bbb9d81a45cd08a3c49cfbe4f3a911a15ac5e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 19:03:35 +0100 Subject: [PATCH 16/17] Benchmark results --- .../Booster/benchmarks/Benchmark_usqcd.csv | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 systems/Booster/benchmarks/Benchmark_usqcd.csv diff --git a/systems/Booster/benchmarks/Benchmark_usqcd.csv b/systems/Booster/benchmarks/Benchmark_usqcd.csv new file mode 100644 index 00000000..68689deb --- /dev/null +++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv @@ -0,0 +1,70 @@ +Memory Bandwidth + +Bytes, GB/s per node +3145728, 225.900365 +50331648, 2858.859504 +254803968, 4145.556367 +805306368, 4905.772480 +1966080000, 4978.312557 + + +GEMM + + M, N, K, BATCH, GF/s per rank +16, 8, 16, 256, 1.713639 +16, 16, 16, 256, 288.268316 +16, 32, 16, 256, 597.053950 +32, 8, 32, 256, 557.382591 +32, 16, 32, 256, 1100.145311 +32, 32, 32, 256, 1885.080449 +64, 8, 64, 256, 1725.163599 +64, 16, 64, 256, 3389.336566 +64, 32, 64, 256, 4168.252422 +16, 8, 256, 256, 1326.262134 +16, 16, 256, 256, 2318.095475 +16, 32, 256, 256, 3555.436503 +32, 8, 256, 256, 1920.139170 +32, 16, 256, 256, 3486.174753 +32, 32, 256, 256, 5320.821724 +64, 8, 256, 256, 2539.597502 +64, 16, 256, 256, 5003.456775 +64, 32, 256, 256, 7837.531562 +8, 256, 16, 256, 1427.848170 +16, 256, 16, 256, 2222.147815 +32, 256, 16, 256, 2877.121715 +8, 256, 32, 256, 1922.890086 +16, 256, 32, 256, 3199.469082 +32, 256, 32, 256, 4845.405343 +8, 256, 64, 256, 2639.483343 +16, 256, 64, 256, 5012.800299 +32, 256, 64, 256, 7216.006882 + + + +Communications + +Packet bytes, direction, GB/s per node +4718592, 2, 206.570734 +4718592, 3, 207.501847 +4718592, 6, 189.730277 +4718592, 7, 204.301218 +15925248, 2, 307.882997 +15925248, 3, 287.901076 +15925248, 6, 295.603109 +15925248, 7, 300.682033 +37748736, 2, 331.740364 +37748736, 3, 338.610627 +37748736, 6, 332.580657 +37748736, 7, 336.336579 + + +Per node summary table + +L , Wilson, DWF4, Staggered, GF/s per node + +8 , 16, 1165, 10 +12 , 473, 4901, 163 +16 , 1436, 8464, 442 +24 , 4133, 10139, 1530 +32 , 5726, 11487, 2518 + From 7e5bd46dd3033aab62599c4cde1d1fc6bb7af8e7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 6 Mar 2024 19:03:45 +0100 Subject: [PATCH 17/17] Booster update --- Grid/algorithms/blas/BatchedBlas.h | 5 +++-- .../implementation/StaggeredKernelsImplementation.h | 12 +----------- systems/Booster/config-command | 6 ++++-- systems/Booster/sourceme.sh | 10 +++++----- 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 5ab46333..f6418b7e 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -31,7 +31,7 @@ Author: Peter Boyle #include #endif #ifdef GRID_CUDA -#include +#include #endif #ifdef GRID_SYCL #include @@ -51,7 +51,7 @@ NAMESPACE_BEGIN(Grid); typedef hipblasHandle_t gridblasHandle_t; #endif #ifdef GRID_CUDA - typedef cudablasHandle_t gridblasHandle_t; + typedef cublasHandle_t gridblasHandle_t; #endif #ifdef GRID_SYCL typedef cl::sycl::queue *gridblasHandle_t; @@ -78,6 +78,7 @@ public: #ifdef GRID_CUDA std::cout << "cublasCreate"<::DhopImproved(StencilImpl &st, LebesgueOrder &lo, if( interior && exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;} +#ifndef GRID_CUDA if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;} #endif } else if( interior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;} -#endif } else if( exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;} -#endif } assert(0 && " Kernel optimisation case not covered "); } @@ -322,19 +318,13 @@ void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, if( interior && exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;} -#endif } else if( interior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;} -#endif } else if( exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;} -#endif } } diff --git a/systems/Booster/config-command b/systems/Booster/config-command index 8530c5f9..1ba2dc7a 100644 --- a/systems/Booster/config-command +++ b/systems/Booster/config-command @@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/ --enable-gen-simd-width=64 \ --enable-shm=nvlink \ --enable-accelerator=cuda \ + --disable-gparity \ + --disable-fermion-reps \ --with-lime=$LIME \ - --disable-accelerator-cshift \ + --enable-accelerator-cshift \ --disable-unified \ CXX=nvcc \ LDFLAGS="-cudart shared " \ - CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas" diff --git a/systems/Booster/sourceme.sh b/systems/Booster/sourceme.sh index 56499be4..2341267f 100644 --- a/systems/Booster/sourceme.sh +++ b/systems/Booster/sourceme.sh @@ -1,5 +1,5 @@ -module load GCC/9.3.0 -module load GMP/6.2.0 -module load MPFR/4.1.0 -module load OpenMPI/4.1.0rc1 -module load CUDA/11.3 +module load GCC +module load GMP +module load MPFR +module load OpenMPI +module load CUDA