diff --git a/.gitignore b/.gitignore index 40156f9d..94e866e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +# Doxygen stuff +html/* +latex/* + # Compiled Object files # ######################### *.slo diff --git a/Grid/Grid_Eigen_Dense.h b/Grid/Grid_Eigen_Dense.h index bdd39a65..8bd1d113 100644 --- a/Grid/Grid_Eigen_Dense.h +++ b/Grid/Grid_Eigen_Dense.h @@ -34,7 +34,7 @@ #pragma push_macro("__SYCL_DEVICE_ONLY__") #undef __SYCL_DEVICE_ONLY__ #define EIGEN_DONT_VECTORIZE -//#undef EIGEN_USE_SYCL +#undef EIGEN_USE_SYCL #define __SYCL__REDEFINE__ #endif diff --git a/Grid/algorithms/approx/Zolotarev.cc b/Grid/algorithms/approx/Zolotarev.cc index c2efd41c..47779eae 100644 --- a/Grid/algorithms/approx/Zolotarev.cc +++ b/Grid/algorithms/approx/Zolotarev.cc @@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k, * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and * type = 1 for the approximation which is infinite at x = 0. */ -zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { +zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) { INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F, l, invlambda, xi, xisq, *tv, s, opl; int m, czero, ts; @@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { construct_partfrac(d); construct_contfrac(d); - /* Converting everything to PRECISION for external use only */ + /* Converting everything to ZOLO_PRECISION for external use only */ zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); - zd -> A = (PRECISION) d -> A; - zd -> Delta = (PRECISION) d -> Delta; - zd -> epsilon = (PRECISION) d -> epsilon; + zd -> A = (ZOLO_PRECISION) d -> A; + zd -> Delta = (ZOLO_PRECISION) d -> Delta; + zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; zd -> n = d -> n; zd -> type = d -> type; zd -> dn = d -> dn; @@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { zd -> deg_num = d -> deg_num; zd -> deg_denom = d -> deg_denom; - zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); - for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; + zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; free(d -> a); - zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); - for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; + zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; free(d -> ap); - zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); - for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; + zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; free(d -> alpha); - zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); - for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; + zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; free(d -> beta); - zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); - for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; + zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; free(d -> gamma); free(d); @@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata) } -zolotarev_data* higham(PRECISION epsilon, int n) { +zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) { INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq; int m, czero; zolotarev_data *zd; @@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) { /* Converting everything to PRECISION for external use only */ zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); - zd -> A = (PRECISION) d -> A; - zd -> Delta = (PRECISION) d -> Delta; - zd -> epsilon = (PRECISION) d -> epsilon; + zd -> A = (ZOLO_PRECISION) d -> A; + zd -> Delta = (ZOLO_PRECISION) d -> Delta; + zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; zd -> n = d -> n; zd -> type = d -> type; zd -> dn = d -> dn; @@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) { zd -> deg_num = d -> deg_num; zd -> deg_denom = d -> deg_denom; - zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); - for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; + zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; free(d -> a); - zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); - for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; + zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; free(d -> ap); - zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); - for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; + zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; free(d -> alpha); - zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); - for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; + zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; free(d -> beta); - zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); - for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; + zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); + for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; free(d -> gamma); free(d); @@ -523,17 +523,17 @@ NAMESPACE_END(Grid); #ifdef TEST #undef ZERO -#define ZERO ((PRECISION) 0) +#define ZERO ((ZOLO_PRECISION) 0) #undef ONE -#define ONE ((PRECISION) 1) +#define ONE ((ZOLO_PRECISION) 1) #undef TWO -#define TWO ((PRECISION) 2) +#define TWO ((ZOLO_PRECISION) 2) /* Evaluate the rational approximation R(x) using the factored form */ -static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R; + ZOLO_PRECISION R; if (rdata -> type == 0) { R = rdata -> A * x; @@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { /* Evaluate the rational approximation R(x) using the partial fraction form */ -static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R = rdata -> alpha[rdata -> da - 1]; + ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1]; for (m = 0; m < rdata -> dd; m++) R += rdata -> alpha[m] / (x * x - rdata -> ap[m]); if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x); @@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0, * but with signalling overflow you will get an error message. */ -static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION R = rdata -> beta[0] * x; + ZOLO_PRECISION R = rdata -> beta[0] * x; for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R; return R; } /* Evaluate the rational approximation R(x) using Cayley form */ -static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) { +static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { int m; - PRECISION T; + ZOLO_PRECISION T; T = rdata -> type == 0 ? ONE : -ONE; for (m = 0; m < rdata -> n; m++) @@ -607,7 +607,7 @@ int main(int argc, char** argv) { int m, n, plotpts = 5000, type = 0; float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr; zolotarev_data *rdata; - PRECISION y; + ZOLO_PRECISION y; FILE *plot_function, *plot_error, *plot_partfrac, *plot_contfrac, *plot_cayley; @@ -626,13 +626,13 @@ int main(int argc, char** argv) { } rdata = type == 2 - ? higham((PRECISION) eps, n) - : zolotarev((PRECISION) eps, n, type); + ? higham((ZOLO_PRECISION) eps, n) + : zolotarev((ZOLO_PRECISION) eps, n, type); printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION) "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION) - "\tPRECISION = " STRINGIFY(PRECISION) + "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION) "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n" "\tDelta = %g (maximum error)\n\n" "\tA = %g (overall factor)\n", @@ -681,15 +681,15 @@ int main(int argc, char** argv) { x = 2.4 * (float) m / plotpts - 1.2; if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) { /* skip x = 0 for type 1, as R(0) is singular */ - y = zolotarev_eval((PRECISION) x, rdata); + y = zolotarev_eval((ZOLO_PRECISION) x, rdata); fprintf(plot_function, "%g %g\n", x, (float) y); fprintf(plot_error, "%g %g\n", x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta)); - ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y) + ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); - ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y) + ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); - ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y) + ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y) / rdata -> Delta); if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) { maxypferr = MAX(maxypferr, fabs(ypferr)); diff --git a/Grid/algorithms/approx/Zolotarev.h b/Grid/algorithms/approx/Zolotarev.h index 800cf3c7..3c983cd3 100644 --- a/Grid/algorithms/approx/Zolotarev.h +++ b/Grid/algorithms/approx/Zolotarev.h @@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx); #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> #ifndef ZOLOTAREV_INTERNAL -#ifndef PRECISION -#define PRECISION double +#ifndef ZOLO_PRECISION +#define ZOLO_PRECISION double #endif -#define ZPRECISION PRECISION +#define ZPRECISION ZOLO_PRECISION #define ZOLOTAREV_DATA zolotarev_data #endif @@ -77,8 +77,8 @@ typedef struct { * zolotarev_data structure. The arguments must satisfy the constraints that * epsilon > 0, n > 0, and type = 0 or 1. */ -ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ; -ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type); +ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ; +ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type); void zolotarev_free(zolotarev_data *zdata); #endif @@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata); NAMESPACE_END(Approx); NAMESPACE_END(Grid); #endif + diff --git a/Grid/algorithms/blas/BatchedBlas.cc b/Grid/algorithms/blas/BatchedBlas.cc new file mode 100644 index 00000000..e79ab1a9 --- /dev/null +++ b/Grid/algorithms/blas/BatchedBlas.cc @@ -0,0 +1,34 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: BatchedBlas.h + + Copyright (C) 2023 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +NAMESPACE_BEGIN(Grid); +gridblasHandle_t GridBLAS::gridblasHandle; +int GridBLAS::gridblasInit; +NAMESPACE_END(Grid); + diff --git a/Grid/algorithms/blas/BatchedBlas.h b/Grid/algorithms/blas/BatchedBlas.h index 82da2d5d..a7edb485 100644 --- a/Grid/algorithms/blas/BatchedBlas.h +++ b/Grid/algorithms/blas/BatchedBlas.h @@ -31,10 +31,16 @@ Author: Peter Boyle #include #endif #ifdef GRID_CUDA -#include +#include #endif #ifdef GRID_SYCL -#error // need oneMKL version +#include +#endif +#if 0 +#define GRID_ONE_MKL +#endif +#ifdef GRID_ONE_MKL +#include #endif /////////////////////////////////////////////////////////////////////// @@ -46,12 +52,15 @@ NAMESPACE_BEGIN(Grid); typedef hipblasHandle_t gridblasHandle_t; #endif #ifdef GRID_CUDA - typedef cudablasHandle_t gridblasHandle_t; + typedef cublasHandle_t gridblasHandle_t; #endif #ifdef GRID_SYCL - typedef int32_t gridblasHandle_t; + typedef cl::sycl::queue *gridblasHandle_t; #endif -#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) +#ifdef GRID_ONE_MKL + typedef cl::sycl::queue *gridblasHandle_t; +#endif +#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) typedef int32_t gridblasHandle_t; #endif @@ -70,12 +79,19 @@ public: #ifdef GRID_CUDA std::cout << "cublasCreate"<wait(); #endif } @@ -252,13 +271,16 @@ public: #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) // Need a default/reference implementation + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { ComplexD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -348,14 +370,19 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; + ComplexF alphaf(real(alpha),imag(alpha)); + ComplexF betaf(real(beta),imag(beta)); // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { - ComplexD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + ComplexF c_mn(0.0); + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ]; } } } @@ -444,14 +471,17 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { RealD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -558,14 +588,17 @@ public: #warning "oneMKL implementation not built " #endif #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) + int sda = lda*k; + int sdb = ldb*k; + int sdc = ldc*n; // Need a default/reference implementation for (int p = 0; p < batchCount; ++p) { for (int mm = 0; mm < m; ++mm) { for (int nn = 0; nn < n; ++nn) { RealD c_mn(0.0); - for (int kk = 0; kk < k, ++kk) - c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; - Cmn[mm + nn*ldc + p*sdc] = (*alpha_p)*c_mn + (*beta_p)*Cmn[mm + nn*ldc + p*sdc]; + for (int kk = 0; kk < k; ++kk) + c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; + Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; } } } @@ -601,9 +634,9 @@ public: deviceVector beta_p(1); acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); - std::cout << "blasZgemmStridedBatched mnk "< A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); deviceVector B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); deviceVector C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); ComplexD alpha(1.0); ComplexD beta (1.0); - for(int i=0;i<10;i++){ - RealD t0 = usecond(); - for(int s=0;s NAMESPACE_BEGIN(Grid); -extern Vector > Cshift_table; +extern std::vector > Cshift_table; +extern commVector > Cshift_table_device; +inline std::pair *MapCshiftTable(void) +{ + // GPU version +#ifdef ACCELERATOR_CSHIFT + uint64_t sz=Cshift_table.size(); + if (Cshift_table_device.size()!=sz ) { + Cshift_table_device.resize(sz); + } + acceleratorCopyToDevice((void *)&Cshift_table[0], + (void *)&Cshift_table_device[0], + sizeof(Cshift_table[0])*sz); + + return &Cshift_table_device[0]; +#else + return &Cshift_table[0]; +#endif + // CPU version use identify map +} /////////////////////////////////////////////////////////////////// // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// @@ -74,8 +93,8 @@ Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dim } { auto buffer_p = & buffer[0]; - auto table = &Cshift_table[0]; -#ifdef ACCELERATOR_CSHIFT + auto table = MapCshiftTable(); +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); @@ -225,7 +244,7 @@ template void Scatter_plane_simple (Lattice &rhs,cshiftVector< { auto buffer_p = & buffer[0]; - auto table = &Cshift_table[0]; + auto table = MapCshiftTable(); #ifdef ACCELERATOR_CSHIFT autoView( rhs_v, rhs, AcceleratorWrite); accelerator_for(i,ent,vobj::Nsimd(),{ @@ -297,30 +316,6 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA } } -#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) - -template -T iDivUp(T a, T b) // Round a / b to nearest higher integer value -{ return (a % b != 0) ? (a / b + 1) : (a / b); } - -template -__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) -{ - int idx = blockIdx.x*blockDim.x + threadIdx.x; - if (idx >= e1*e2) return; - - int n, b, o; - - n = idx / e2; - b = idx % e2; - o = n*stride + b; - - vector[2*idx + 0] = lo + o; - vector[2*idx + 1] = ro + o; -} - -#endif - ////////////////////////////////////////////////////// // local to node block strided copies ////////////////////////////////////////////////////// @@ -345,20 +340,12 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs int ent=0; if(cbmask == 0x3 ){ -#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) - ent = e1*e2; - dim3 blockSize(acceleratorThreads()); - dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); - populate_Cshift_table<<>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); - accelerator_barrier(); -#else for(int n=0;n(lo+o,ro+o); } } -#endif } else { for(int n=0;n void Copy_plane(Lattice& lhs,const Lattice &rhs } { - auto table = &Cshift_table[0]; + auto table = MapCshiftTable(); #ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); @@ -409,19 +396,11 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); - accelerator_barrier(); -#else for(int n=0;n(lo+o+b,ro+o+b); }} -#endif } else { for(int n=0;n void Copy_plane_permute(Lattice& lhs,const Lattice Lattice Cshift(const Lattice &rhs,int dimension int comm_dim = rhs.Grid()->_processors[dimension] >1 ; int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); - + RealD t1,t0; + t0=usecond(); if ( !comm_dim ) { //std::cout << "CSHIFT: Cshift_local" < Lattice Cshift(const Lattice &rhs,int dimension //std::cout << "CSHIFT: Cshift_comms" < void Cshift_comms(Lattice &ret,const Lattice &r int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); - + RealD tcopy=0.0; + RealD tgather=0.0; + RealD tscatter=0.0; + RealD tcomms=0.0; + uint64_t xbytes=0; for(int x=0;x void Cshift_comms(Lattice &ret,const Lattice &r int bytes = words * sizeof(vobj); + tgather-=usecond(); Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); + tgather+=usecond(); // int rank = grid->_processor; int recv_from_rank; int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - - grid->Barrier(); + + tcomms-=usecond(); + // grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + xbytes+=bytes; + // grid->Barrier(); + tcomms+=usecond(); - grid->Barrier(); - + tscatter-=usecond(); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); + tscatter+=usecond(); } } + /* + std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -190,6 +210,12 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice=0); assert(shiftPermuteType(dimension); /////////////////////////////////////////////// @@ -227,7 +253,9 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; recv_buf_extract_mpi = &recv_buf_extract[i][0]; @@ -262,7 +291,9 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + xbytes+=bytes; + // grid->Barrier(); + tcomms+=usecond(); rpointers[i] = &recv_buf_extract[i][0]; } else { @@ -270,9 +301,17 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -292,6 +331,11 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(comm_dim==1); assert(shift>=0); assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); @@ -315,7 +359,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r if (comm_proc==0) { + tcopy-=usecond(); Copy_plane(ret,rhs,dimension,x,sx,cbmask); + tcopy+=usecond(); } else { @@ -324,7 +370,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int bytes = words * sizeof(vobj); + tgather-=usecond(); Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + tgather+=usecond(); // int rank = grid->_processor; int recv_from_rank; @@ -332,7 +380,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); grid->SendToRecvFrom((void *)&send_buf[0], @@ -340,13 +389,24 @@ template void Cshift_comms(Lattice &ret,const Lattice &r (void *)&recv_buf[0], recv_from_rank, bytes); + xbytes+=bytes; acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); - grid->Barrier(); + // grid->Barrier(); + tcomms+=usecond(); + tscatter-=usecond(); Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + tscatter+=usecond(); } } + /* + std::cout << GridLogPerformance << " Cshift copy "< void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) @@ -372,6 +432,11 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice=0); assert(shiftPermuteType(dimension); @@ -414,8 +479,10 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->Barrier(); + tcomms-=usecond(); + // grid->Barrier(); acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); grid->SendToRecvFrom((void *)send_buf_extract_mpi, @@ -449,17 +517,28 @@ template void Cshift_comms_simd(Lattice &ret,const LatticeBarrier(); + // grid->Barrier(); + tcomms+=usecond(); rpointers[i] = &recv_buf_extract[i][0]; } else { rpointers[i] = &send_buf_extract[nbr_lane][0]; } } + tscatter-=usecond(); Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); - } + tscatter+=usecond(); + } + /* + std::cout << GridLogPerformance << " Cshift (s) copy "< NAMESPACE_BEGIN(Grid); -Vector > Cshift_table; +std::vector > Cshift_table; +commVector > Cshift_table_device; NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 6343db99..79572949 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -35,6 +35,7 @@ Author: Peter Boyle #include #include #include +#include #include #include #include @@ -46,5 +47,4 @@ Author: Peter Boyle #include #include #include -#include #include diff --git a/Grid/lattice/Lattice_arith.h b/Grid/lattice/Lattice_arith.h index aebc093a..5b37532f 100644 --- a/Grid/lattice/Lattice_arith.h +++ b/Grid/lattice/Lattice_arith.h @@ -270,5 +270,42 @@ RealD axpby_norm(Lattice &ret,sobj a,sobj b,const Lattice &x,const L return axpby_norm_fast(ret,a,b,x,y); } +/// Trace product +template auto traceProduct(const Lattice &rhs_1,const Lattice &rhs_2) + -> Lattice +{ + typedef decltype(trace(obj())) robj; + Lattice ret_i(rhs_1.Grid()); + autoView( rhs1 , rhs_1, AcceleratorRead); + autoView( rhs2 , rhs_2, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); + ret.Checkerboard() = rhs_1.Checkerboard(); + accelerator_for(ss,rhs1.size(),obj::Nsimd(),{ + coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss))); + }); + return ret_i; +} + +template auto traceProduct(const Lattice &rhs_1,const obj2 &rhs2) + -> Lattice +{ + typedef decltype(trace(obj1())) robj; + Lattice ret_i(rhs_1.Grid()); + autoView( rhs1 , rhs_1, AcceleratorRead); + autoView( ret , ret_i, AcceleratorWrite); + ret.Checkerboard() = rhs_1.Checkerboard(); + accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{ + coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2)); + }); + return ret_i; +} +template auto traceProduct(const obj2 &rhs_2,const Lattice &rhs_1) + -> Lattice +{ + return traceProduct(rhs_1,rhs_2); +} + + + NAMESPACE_END(Grid); #endif diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index 0928cbd7..03a869fb 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) basis_v.push_back(basis[k].View(AcceleratorWrite)); } -#if ( (!defined(GRID_CUDA)) ) +#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) ) int max_threads = thread_max(); Vector < vobj > Bt(Nm * max_threads); thread_region diff --git a/Grid/lattice/Lattice_crc.h b/Grid/lattice/Lattice_crc.h index 1005d196..f9aca1f6 100644 --- a/Grid/lattice/Lattice_crc.h +++ b/Grid/lattice/Lattice_crc.h @@ -42,13 +42,13 @@ template void DumpSliceNorm(std::string s,const Lattice &f,int } } -template uint32_t crc(Lattice & buf) +template uint32_t crc(const Lattice & buf) { autoView( buf_v , buf, CpuRead); return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); } -#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "< #if defined(GRID_SYCL) #include #endif +#include NAMESPACE_BEGIN(Grid); @@ -305,6 +306,7 @@ template inline ComplexD innerProduct(const Lattice &left,const Lattice &right) { GridBase *grid = left.Grid(); ComplexD nrm = rankInnerProduct(left,right); + // std::cerr<<"flight log " << std::hexfloat << nrm <<" "<GlobalSum(nrm); return nrm; } @@ -469,19 +471,10 @@ template inline void sliceSum(const Lattice &Data,std::vector< int e1= grid->_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - - // sum over reduced dimension planes, breaking out orthog dir - // Parallel over orthog direction - autoView( Data_v, Data, CpuRead); - thread_for( r,rd, { - int so=r*grid->_ostride[orthogdim]; // base offset for start of plane - for(int n=0;n_ostride[orthogdim]; + + //Reduce Data down to lvSum + sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); // Sum across simd lanes in the plane, breaking out orthog dir. Coordinate icoor(Nd); @@ -525,6 +518,7 @@ sliceSum(const Lattice &Data,int orthogdim) return result; } + template static void sliceInnerProductVector( std::vector & result, const Lattice &lhs,const Lattice &rhs,int orthogdim) { diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h index b7819c36..e82494f5 100644 --- a/Grid/lattice/Lattice_reduction_gpu.h +++ b/Grid/lattice/Lattice_reduction_gpu.h @@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & cudaGetDevice(&device); #endif #ifdef GRID_HIP - auto discard=hipGetDevice(&device); + auto r=hipGetDevice(&device); #endif Iterator warpSize = gpu_props[device].warpSize; diff --git a/Grid/lattice/Lattice_rng.h b/Grid/lattice/Lattice_rng.h index a19edf00..88028a59 100644 --- a/Grid/lattice/Lattice_rng.h +++ b/Grid/lattice/Lattice_rng.h @@ -152,6 +152,7 @@ public: #ifdef RNG_FAST_DISCARD static void Skip(RngEngine &eng,uint64_t site) { +#if 0 ///////////////////////////////////////////////////////////////////////////////////// // Skip by 2^40 elements between successive lattice sites // This goes by 10^12. @@ -162,9 +163,9 @@ public: // tens of seconds per trajectory so this is clean in all reasonable cases, // and margin of safety is orders of magnitude. // We could hack Sitmo to skip in the higher order words of state if necessary - // - // Replace with 2^30 ; avoid problem on large volumes - // + // + // Replace with 2^30 ; avoid problem on large volumes + // ///////////////////////////////////////////////////////////////////////////////////// // uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init const int shift = 30; @@ -179,6 +180,9 @@ public: assert((skip >> shift)==site); // check for overflow eng.discard(skip); +#else + eng.discardhi(site); +#endif // std::cout << " Engine " < +#if defined(GRID_CUDA) + +#include +#define gpucub cub +#define gpuError_t cudaError_t +#define gpuSuccess cudaSuccess + +#elif defined(GRID_HIP) + +#include +#define gpucub hipcub +#define gpuError_t hipError_t +#define gpuSuccess hipSuccess + +#endif + + +NAMESPACE_BEGIN(Grid); + + +#if defined(GRID_CUDA) || defined(GRID_HIP) +template inline void sliceSumReduction_cub_small(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { + size_t subvol_size = e1*e2; + commVector reduction_buffer(rd*subvol_size); + auto rb_p = &reduction_buffer[0]; + vobj zero_init; + zeroit(zero_init); + + + void *temp_storage_array = NULL; + size_t temp_storage_bytes = 0; + vobj *d_out; + int* d_offsets; + + std::vector offsets(rd+1,0); + + for (int i = 0; i < offsets.size(); i++) { + offsets[i] = i*subvol_size; + } + + //Allocate memory for output and offset arrays on device + d_out = static_cast(acceleratorAllocDevice(rd*sizeof(vobj))); + + d_offsets = static_cast(acceleratorAllocDevice((rd+1)*sizeof(int))); + + //copy offsets to device + acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); + + + gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); + if (gpuErr!=gpuSuccess) { + std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr < inline void sliceSumReduction_cub_large(const vobj *Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { + typedef typename vobj::vector_type vector; + const int words = sizeof(vobj)/sizeof(vector); + const int osites = rd*e1*e2; + commVectorbuffer(osites); + vector *dat = (vector *)Data; + vector *buf = &buffer[0]; + Vector lvSum_small(rd); + vector *lvSum_ptr = (vector *)&lvSum[0]; + + for (int w = 0; w < words; w++) { + accelerator_for(ss,osites,1,{ + buf[ss] = dat[ss*words+w]; + }); + + sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); + + for (int r = 0; r < rd; r++) { + lvSum_ptr[w+words*r]=lvSum_small[r]; + } + + } + + +} + +template inline void sliceSumReduction_cub(const Lattice &Data, Vector &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) +{ + autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case. + if constexpr (sizeof(vobj) <= 256) { + sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + } + else { + sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); + } +} +#endif + + +#if defined(GRID_SYCL) +template inline void sliceSumReduction_sycl(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +{ + typedef typename vobj::scalar_object sobj; + size_t subvol_size = e1*e2; + + vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator); + vobj vobj_zero; + zeroit(vobj_zero); + + commVector reduction_buffer(rd*subvol_size); + + auto rb_p = &reduction_buffer[0]; + + autoView(Data_v, Data, AcceleratorRead); + + //prepare reduction buffer + accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{ + + int n = s / e2; + int b = s % e2; + int so=r*ostride; // base offset for start of plane + int ss= so+n*stride+b; + + coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss])); + + }); + + for (int r = 0; r < rd; r++) { + mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(mysum,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{subvol_size}, + Reduction, + [=](cl::sycl::id<1> item, auto &sum) { + auto s = item[0]; + sum += rb_p[r*subvol_size+s]; + }); + }); + theGridAccelerator->wait(); + lvSum[r] = mysum[0]; + } + + free(mysum,*theGridAccelerator); +} +#endif + +template inline void sliceSumReduction_cpu(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +{ + // sum over reduced dimension planes, breaking out orthog dir + // Parallel over orthog direction + autoView( Data_v, Data, CpuRead); + thread_for( r,rd, { + int so=r*ostride; // base offset for start of plane + for(int n=0;n inline void sliceSumReduction(const Lattice &Data, Vector &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) +{ + #if defined(GRID_CUDA) || defined(GRID_HIP) + + sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #elif defined(GRID_SYCL) + + sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #else + sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); + + #endif +} + + +NAMESPACE_END(Grid); diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index cb568abd..064c10e6 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -45,6 +45,7 @@ public: }; // Host only GridBase * getGrid(void) const { return _grid; }; + vobj* getHostPointer(void) const { return _odata; }; }; ///////////////////////////////////////////////////////////////////////////////////////// diff --git a/Grid/log/Log.h b/Grid/log/Log.h index 2d663a3c..370b0428 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -179,11 +179,11 @@ extern GridLogger GridLogSolver; extern GridLogger GridLogError; extern GridLogger GridLogWarning; extern GridLogger GridLogMessage; -extern GridLogger GridLogDebug ; +extern GridLogger GridLogDebug; extern GridLogger GridLogPerformance; extern GridLogger GridLogDslash; -extern GridLogger GridLogIterative ; -extern GridLogger GridLogIntegrator ; +extern GridLogger GridLogIterative; +extern GridLogger GridLogIntegrator; extern GridLogger GridLogHMC; extern GridLogger GridLogMemory; extern GridLogger GridLogTracing; @@ -191,6 +191,41 @@ extern Colours GridLogColours; std::string demangle(const char* name) ; +template +inline std::string sjoin(Args&&... args) noexcept { + std::ostringstream msg; + (msg << ... << args); + return msg.str(); +} + +/*! @brief make log messages work like python print */ +template +inline void Grid_log(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << GridLogMessage << msg << std::endl; +} + +/*! @brief make warning messages work like python print */ +template +inline void Grid_warn(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl; +} + +/*! @brief make error messages work like python print */ +template +inline void Grid_error(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl; +} + +/*! @brief make pass messages work like python print */ +template +inline void Grid_pass(Args&&... args) { + std::string msg = sjoin(std::forward(args)...); + std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl; +} + #define _NBACKTRACE (256) extern void * Grid_backtrace_buffer[_NBACKTRACE]; diff --git a/Grid/perfmon/Tracing.h b/Grid/perfmon/Tracing.h index 5000cef4..10b638dc 100644 --- a/Grid/perfmon/Tracing.h +++ b/Grid/perfmon/Tracing.h @@ -34,7 +34,7 @@ class GridTracer { }; inline void tracePush(const char *name) { roctxRangePushA(name); } inline void tracePop(const char *name) { roctxRangePop(); } -inline int traceStart(const char *name) { roctxRangeStart(name); } +inline int traceStart(const char *name) { return roctxRangeStart(name); } inline void traceStop(int ID) { roctxRangeStop(ID); } #endif diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index d34702c1..8acae81b 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -129,6 +129,22 @@ public: virtual ~Action(){} }; +template +class EmptyAction : public Action +{ + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions + virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action + virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative + + /////////////////////////////// + // Logging + /////////////////////////////// + virtual std::string action_name() { return std::string("Level Force Log"); }; + virtual std::string LogParameters() { return std::string("No parameters");}; +}; + + + NAMESPACE_END(Grid); #endif // ACTION_BASE_H diff --git a/Grid/qcd/action/fermion/WilsonTMFermion.h b/Grid/qcd/action/fermion/WilsonTMFermion.h index 12c4b71a..cca716a0 100644 --- a/Grid/qcd/action/fermion/WilsonTMFermion.h +++ b/Grid/qcd/action/fermion/WilsonTMFermion.h @@ -63,7 +63,9 @@ public: virtual void MooeeDag(const FermionField &in, FermionField &out) ; virtual void MooeeInv(const FermionField &in, FermionField &out) ; virtual void MooeeInvDag(const FermionField &in, FermionField &out) ; - + virtual void M(const FermionField &in, FermionField &out) ; + virtual void Mdag(const FermionField &in, FermionField &out) ; + private: RealD mu; // TwistedMass parameter diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index dd62e109..a39b529f 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -280,20 +280,16 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, if( interior && exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;} +#ifndef GRID_CUDA if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;} #endif } else if( interior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;} -#endif } else if( exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;} -#endif } assert(0 && " Kernel optimisation case not covered "); } @@ -322,19 +318,13 @@ void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, if( interior && exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;} -#endif } else if( interior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;} -#endif } else if( exterior ) { if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} -#ifndef GRID_CUDA if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;} -#endif } } diff --git a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h index 9a1a152c..12771c29 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonTMFermionImplementation.h @@ -93,5 +93,25 @@ void WilsonTMFermion::MooeeInvDag(const FermionField &in, FermionField &ou RealD b = tm /sq; axpibg5x(out,in,a,b); } +template +void WilsonTMFermion::M(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + this->Dhop(in, out, DaggerNo); + FermionField tmp(out.Grid()); + RealD a = 4.0+this->mass; + RealD b = this->mu; + axpibg5x(tmp,in,a,b); + axpy(out, 1.0, tmp, out); +} +template +void WilsonTMFermion::Mdag(const FermionField &in, FermionField &out) { + out.Checkerboard() = in.Checkerboard(); + this->Dhop(in, out, DaggerYes); + FermionField tmp(out.Grid()); + RealD a = 4.0+this->mass; + RealD b = -this->mu; + axpibg5x(tmp,in,a,b); + axpy(out, 1.0, tmp, out); +} NAMESPACE_END(Grid); diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 4dd5a634..0276b6fd 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -87,6 +87,8 @@ public: const ActionSet as; + ActionSet LevelForces; + //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default static MomentumFilterBase const* getDefaultMomFilter(){ static MomentumFilterNone filter; @@ -124,6 +126,9 @@ public: // input U actually not used in the fundamental case // Fundamental updates, include smearing + assert(as.size()==LevelForces.size()); + + Field level_force(U.Grid()); level_force =Zero(); for (int a = 0; a < as[level].actions.size(); ++a) { double start_full = usecond(); @@ -144,7 +149,10 @@ public: MomFilter->applyFilter(force); std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; @@ -167,6 +175,16 @@ public: } + { + // total force + Real force_abs = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x]) + Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR; + + Real force_max = std::sqrt(maxLocalNorm2(level_force)); + Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR; + LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); + } + // Force from the other representations as[level].apply(update_P_hireps, Representations, Mom, U, ep); @@ -216,6 +234,16 @@ public: //Default the momentum filter to "do-nothing" MomFilter = getDefaultMomFilter(); + + for (int level = 0; level < as.size(); ++level) { + int multiplier = as.at(level).multiplier; + ActionLevel * Level = new ActionLevel(multiplier); + Level->push_back(new EmptyAction); + LevelForces.push_back(*Level); + // does it copy by value or reference?? + // - answer it copies by value, BUT the action level contains a reference that is NOT updated. + // Unsafe code in Guido's area + } }; virtual ~Integrator() {} @@ -233,10 +261,14 @@ public: void reset_timer(void) { + assert(as.size()==LevelForces.size()); for (int level = 0; level < as.size(); ++level) { for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { as[level].actions.at(actionID)->reset_timer(); } + int actionID=0; + assert(LevelForces.at(level).actions.size()==1); + LevelForces.at(level).actions.at(actionID)->reset_timer(); } } void print_timer(void) @@ -298,6 +330,16 @@ public: <<" calls " << as[level].actions.at(actionID)->deriv_num << std::endl; } + int actionID=0; + std::cout << GridLogMessage + << LevelForces[level].actions.at(actionID)->action_name() + <<"["<deriv_max_average() + <<" norm " << LevelForces[level].actions.at(actionID)->deriv_norm_average() + <<" Fdt max " << LevelForces[level].actions.at(actionID)->Fdt_max_average() + <<" Fdt norm " << LevelForces[level].actions.at(actionID)->Fdt_norm_average() + <<" calls " << LevelForces[level].actions.at(actionID)->deriv_num + << std::endl; } std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; } @@ -319,6 +361,13 @@ public: std::cout << as[level].actions.at(actionID)->LogParameters(); } } + std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <action_name() << "] ID: " << actionID << std::endl; + } + } std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; } @@ -400,6 +449,7 @@ public: RealD S(Field& U) { // here also U not used + assert(as.size()==LevelForces.size()); std::cout << GridLogIntegrator << "Integrator action\n"; RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom diff --git a/Grid/qcd/smearing/GaugeConfigurationMasked.h b/Grid/qcd/smearing/GaugeConfigurationMasked.h index 4309c470..78846263 100644 --- a/Grid/qcd/smearing/GaugeConfigurationMasked.h +++ b/Grid/qcd/smearing/GaugeConfigurationMasked.h @@ -1,3 +1,4 @@ + /*! @file GaugeConfiguration.h @brief Declares the GaugeConfiguration class @@ -6,6 +7,15 @@ NAMESPACE_BEGIN(Grid); + +template void Dump(const Lattice & lat, + std::string s, + Coordinate site = Coordinate({0,0,0,0})) +{ + typename T::scalar_object tmp; + peekSite(tmp,lat,site); + std::cout << " Dump "< WL; + GaugeLinkField staple(grid), u_tmp(grid); + GaugeLinkField iLambda_mu(grid), iLambda_nu(grid); + GaugeLinkField U_mu(grid), U_nu(grid); + GaugeLinkField sh_field(grid), temp_Sigma(grid); + Real rho_munu, rho_numu; + + rho_munu = rho; + rho_numu = rho; + for(int mu = 0; mu < Nd; ++mu){ + U_mu = peekLorentz( U, mu); + iLambda_mu = peekLorentz(iLambda, mu); + + for(int nu = 0; nu < Nd; ++nu){ + if(nu==mu) continue; + + U_nu = peekLorentz( U, nu); + + // Nd(nd-1) = 12 staples normally. + // We must compute 6 of these + // in FTHMC case + if ( (mu==mmu)||(nu==mmu) ) + WL.StapleUpper(staple, U, mu, nu); + + if(nu==mmu) { + iLambda_nu = peekLorentz(iLambda, nu); + + temp_Sigma = -rho_numu*staple*iLambda_nu; //ok + //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + + sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity? + + temp_Sigma = rho_numu*sh_field*staple; //ok + //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + } + + if ( mu == mmu ) { + sh_field = Cshift(iLambda_mu, nu, 1); + + temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok + //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x) + Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); + } + + // staple = Zero(); + sh_field = Cshift(U_nu, mu, 1); + + temp_Sigma = Zero(); + + if ( mu == mmu ) + temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu; + + if ( nu == mmu ) { + temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu; + + u_tmp = adj(U_nu)*iLambda_nu; + sh_field = Cshift(u_tmp, mu, 1); + temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu; + } + + sh_field = Cshift(temp_Sigma, nu, -1); + Gimpl::AddLink(SigmaTerm, sh_field, mu); + + } + } + } + + void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) { + GridBase *grid = U.Grid(); + GaugeLinkField tmp_stpl(grid); + WilsonLoops WL; + Cup = Zero(); + for(int nu=0; nu(Dbc,tmp,b,c); // Adjoint rep } +#endif + tp+=usecond(); } - tmp = trace(MpInvJx * Dbc); + // Dump(Dbc_opt,"Dbc_opt"); + // Dump(Dbc,"Dbc"); + tpk-=usecond(); + tmp = trace(MpInvJx * Dbc_opt); PokeIndex(Fdet2,tmp,a); + tpk+=usecond(); } + t+=usecond(); + std::cout << GridLogPerformance << " Compute_MpInvJx_dNxxdSy " << t/1e3 << " ms proj "<(NxAd,tmp,c,b); } +#endif } } void ApplyMask(GaugeField &U,int smr) @@ -164,8 +301,7 @@ public: // Computes ALL the staples -- could compute one only and do it here RealD time; time=-usecond(); - this->StoutSmearing->BaseSmear(C, U); - Cmu = peekLorentz(C, mu); + BaseSmear(Cmu, U,mu,rho); ////////////////////////////////////////////////////////////////// // Assemble Luscher exp diff map J matrix @@ -209,6 +345,36 @@ public: // dJ(x)/dxe ////////////////////////////////////// time=-usecond(); +#if 1 + std::vector dJdX; dJdX.resize(8,grid); + std::vector TRb_s; TRb_s.resize(8); + AdjMatrixField tbXn(grid); + AdjMatrixField sumXtbX(grid); + AdjMatrixField t2(grid); + AdjMatrixField dt2(grid); + AdjMatrixField t3(grid); + AdjMatrixField dt3(grid); + AdjMatrixField aunit(grid); + + for(int b=0;b<8;b++){ + SU3Adjoint::generator(b, TRb_s[b]); + dJdX[b] = TRb_s[b]; + } + aunit = ComplexD(1.0); + // Could put into an accelerator_for + X = (-1.0)*ZxAd; + t2 = X; + for (int j = 12; j > 1; --j) { + t3 = t2*(1.0 / (j + 1)) + aunit; + t2 = X * t3; + for(int b=0;b<8;b++){ + dJdX[b]= TRb_s[b] * t3 + X * dJdX[b]*(1.0 / (j + 1)); + } + } + for(int b=0;b<8;b++){ + dJdX[b] = -dJdX[b]; + } +#else std::vector dJdX; dJdX.resize(8,grid); AdjMatrixField tbXn(grid); AdjMatrixField sumXtbX(grid); @@ -224,14 +390,15 @@ public: X = (-1.0)*ZxAd; t2 = X; dt2 = TRb; - for (int j = 20; j > 1; --j) { - t3 = t2*(1.0 / (j + 1)) + aunit; + for (int j = 12; j > 1; --j) { + t3 = t2*(1.0 / (j + 1)) + aunit; dt3 = dt2*(1.0 / (j + 1)); t2 = X * t3; dt2 = TRb * t3 + X * dt3; } dJdX[b] = -dt2; } +#endif time+=usecond(); std::cout << GridLogMessage << "dJx took "<StoutSmearing->BaseSmear(C, U); - Cmu = peekLorentz(C, mu); + double rho=this->StoutSmearing->SmearRho[1]; + BaseSmear(Cmu, U,mu,rho); + Umu = peekLorentz(U, mu); Complex ci(0,1); for(int b=0;b(Ncb,tmp,c,b); } +#endif } ////////////////////////////////////////////////////////////////// @@ -693,15 +865,19 @@ private: const GaugeField& GaugeK,int level) { GridBase* grid = GaugeK.Grid(); - GaugeField C(grid), SigmaK(grid), iLambda(grid); + GaugeField SigmaK(grid), iLambda(grid); GaugeField SigmaKPrimeA(grid); GaugeField SigmaKPrimeB(grid); GaugeLinkField iLambda_mu(grid); GaugeLinkField iQ(grid), e_iQ(grid); GaugeLinkField SigmaKPrime_mu(grid); GaugeLinkField GaugeKmu(grid), Cmu(grid); - - this->StoutSmearing->BaseSmear(C, GaugeK); + + int mmu= (level/2) %Nd; + int cb= (level%2); + double rho=this->StoutSmearing->SmearRho[1]; + + // Can override this to do one direction only. SigmaK = Zero(); iLambda = Zero(); @@ -712,18 +888,38 @@ private: // Could get away with computing only one polarisation here // int mu= (smr/2) %Nd; // SigmaKprime_A has only one component - for (int mu = 0; mu < Nd; mu++) +#if 0 + BaseSmear(Cmu, GaugeK,mu,rho); + GaugeKmu = peekLorentz(GaugeK, mu); + SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu); + iQ = Ta(Cmu * adj(GaugeKmu)); + this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu); + pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); + pokeLorentz(iLambda, iLambda_mu, mu); + BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho); // derivative of SmearBase +#else + // GaugeField C(grid); + // this->StoutSmearing->BaseSmear(C, GaugeK); + // for (int mu = 0; mu < Nd; mu++) + int mu =mmu; + BaseSmear(Cmu, GaugeK,mu,rho); { - Cmu = peekLorentz(C, mu); + // Cmu = peekLorentz(C, mu); GaugeKmu = peekLorentz(GaugeK, mu); SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu); iQ = Ta(Cmu * adj(GaugeKmu)); this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu); pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); pokeLorentz(iLambda, iLambda_mu, mu); + std::cout << " mu "<StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase - + // GaugeField SigmaKcopy(grid); + // SigmaKcopy = SigmaK; + BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho); // derivative of SmearBase + // this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase + // SigmaKcopy = SigmaKcopy - SigmaK; + // std::cout << " BaseSmearDerivative fast path error" < + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* + @file HISQSmearing.h + @brief Declares classes related to HISQ smearing +*/ + + +#pragma once +#include +#include +#include + + +NAMESPACE_BEGIN(Grid); + + +// TODO: find a way to fold this into the stencil header. need to access grid to get +// Nd, since you don't want to inherit from QCD.h +/*! @brief append arbitrary shift path to shifts */ +template +void appendShift(std::vector& shifts, int dir, Args... args) { + Coordinate shift(Nd,0); + generalShift(shift, dir, args...); + // push_back creates an element at the end of shifts and + // assigns the data in the argument to it. + shifts.push_back(shift); +} + + +/*! @brief figure out the stencil index from mu and nu */ +accelerator_inline int stencilIndex(int mu, int nu) { + // Nshifts depends on how you built the stencil + int Nshifts = 6; + return Nshifts*nu + Nd*Nshifts*mu; +} + + +/*! @brief structure holding the link treatment */ +struct SmearingParameters{ + SmearingParameters(){} + Real c_1; // 1 link + Real c_naik; // Naik term + Real c_3; // 3 link + Real c_5; // 5 link + Real c_7; // 7 link + Real c_lp; // 5 link Lepage + SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) + : c_1(c1), + c_naik(cnaik), + c_3(c3), + c_5(c5), + c_7(c7), + c_lp(clp){} +}; + + +/*! @brief create fat links from link variables */ +template +class Smear_HISQ : public Gimpl { + +private: + GridCartesian* const _grid; + SmearingParameters _linkTreatment; + +public: + + INHERIT_GIMPL_TYPES(Gimpl); + typedef typename Gimpl::GaugeField GF; + typedef typename Gimpl::GaugeLinkField LF; + typedef typename Gimpl::ComplexField CF; + + // Don't allow default values here. + Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) + : _grid(grid), + _linkTreatment(c1,cnaik,c3,c5,c7,clp) { + assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); + assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); + } + + // Allow to pass a pointer to a C-style, double array for MILC convenience + Smear_HISQ(GridCartesian* grid, double* coeff) + : _grid(grid), + _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) { + assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); + assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); + } + + ~Smear_HISQ() {} + + // Intent: OUT--u_smr, u_naik + // IN--u_thin + void smear(GF& u_smr, GF& u_naik, GF& u_thin) const { + + SmearingParameters lt = this->_linkTreatment; + auto grid = this->_grid; + + // Create a padded cell of extra padding depth=1 and fill the padding. + int depth = 1; + PaddedCell Ghost(depth,grid); + GF Ughost = Ghost.Exchange(u_thin); + + // This is where auxiliary N-link fields and the final smear will be stored. + GF Ughost_fat(Ughost.Grid()); + GF Ughost_3link(Ughost.Grid()); + GF Ughost_5linkA(Ughost.Grid()); + GF Ughost_5linkB(Ughost.Grid()); + + // mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier, + // but these entries will not be used. + std::vector shifts; + for(int mu=0;mu_permute,Nd)) U3matrix; + + int Nsites = U_v.size(); + auto gStencil_v = gStencil.View(); + + accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs + stencilElement SE0, SE1, SE2, SE3, SE4, SE5; + U3matrix U0, U1, U2, U3, U4, U5, W; + for(int nu=0;nu_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu = SE5->_offset; + + // When you're deciding whether to take an adjoint, the question is: how is the + // stored link oriented compared to the one you want? If I imagine myself travelling + // with the to-be-updated link, I have two possible, alternative 3-link paths I can + // take, one starting by going to the left, the other starting by going to the right. + U0 = coalescedReadGeneralPermute(U_v[x_p_mu ](nu),SE0->_permute,Nd); + U1 = coalescedReadGeneralPermute(U_v[x_p_nu ](mu),SE1->_permute,Nd); + U2 = coalescedReadGeneralPermute(U_v[x ](nu),SE2->_permute,Nd); + U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd); + U4 = coalescedReadGeneralPermute(U_v[x_m_nu ](mu),SE4->_permute,Nd); + U5 = coalescedReadGeneralPermute(U_v[x_m_nu ](nu),SE4->_permute,Nd); + + // "left" "right" + W = U2*U1*adj(U0) + adj(U5)*U4*U3; + + // Save 3-link construct for later and add to smeared field. + coalescedWrite(U_3link_v[x](nu), W); + + // The index operator (x) returns the coalesced read on GPU. The view [] index returns + // a reference to the vector object. The [x](mu) returns a reference to the densely + // packed (contiguous in memory) mu-th element of the vector object. On CPU, + // coalescedRead/Write is the identity mapping assigning vector object to vector object. + // But on GPU it's non-trivial and maps scalar object to vector object and vice versa. + coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W); + } + }) + + accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link + stencilElement SE0, SE1, SE2, SE3, SE4, SE5; + U3matrix U0, U1, U2, U3, U4, U5, W; + int sigmaIndex = 0; + for(int nu=0;nu_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + + U0 = coalescedReadGeneralPermute( U_v[x_p_mu ](nu ),SE0->_permute,Nd); + U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu ](rho),SE1->_permute,Nd); + U2 = coalescedReadGeneralPermute( U_v[x ](nu ),SE2->_permute,Nd); + U3 = coalescedReadGeneralPermute( U_v[x_p_mu_m_nu](nu ),SE3->_permute,Nd); + U4 = coalescedReadGeneralPermute(U_3link_v[x_m_nu ](rho),SE4->_permute,Nd); + U5 = coalescedReadGeneralPermute( U_v[x_m_nu ](nu ),SE4->_permute,Nd); + + W = U2*U1*adj(U0) + adj(U5)*U4*U3; + + if(sigmaIndex<3) { + coalescedWrite(U_5linkA_v[x](rho), W); + } else { + coalescedWrite(U_5linkB_v[x](rho), W); + } + + coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_5*W); + sigmaIndex++; + } + } + }) + + accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link + stencilElement SE0, SE1, SE2, SE3, SE4, SE5; + U3matrix U0, U1, U2, U3, U4, U5, W; + int sigmaIndex = 0; + for(int nu=0;nu_offset; + SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset; + SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset; + SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; + SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset; + + U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd); + if(sigmaIndex<3) { + U1 = coalescedReadGeneralPermute(U_5linkB_v[x_p_nu](rho),SE1->_permute,Nd); + } else { + U1 = coalescedReadGeneralPermute(U_5linkA_v[x_p_nu](rho),SE1->_permute,Nd); + } + U2 = coalescedReadGeneralPermute(U_v[x](nu),SE2->_permute,Nd); + U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd); + if(sigmaIndex<3) { + U4 = coalescedReadGeneralPermute(U_5linkB_v[x_m_nu](rho),SE4->_permute,Nd); + } else { + U4 = coalescedReadGeneralPermute(U_5linkA_v[x_m_nu](rho),SE4->_permute,Nd); + } + U5 = coalescedReadGeneralPermute(U_v[x_m_nu](nu),SE4->_permute,Nd); + + W = U2*U1*adj(U0) + adj(U5)*U4*U3; + + coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_7*W); + sigmaIndex++; + } + } + }) + + } // end mu loop + + // c1, c3, c5, c7 construct contributions + u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; + + // Load up U and V std::vectors to access thin and smeared links. + std::vector U(Nd, grid); + std::vector V(Nd, grid); + std::vector Vnaik(Nd, grid); + for (int mu = 0; mu < Nd; mu++) { + U[mu] = PeekIndex(u_thin, mu); + V[mu] = PeekIndex(u_smr, mu); + } + + for(int mu=0;mu(u_smr , V[mu] , mu); + PokeIndex(u_naik, Vnaik[mu], mu); + } + }; + + + // Intent: OUT--u_proj + // IN--u_mu + void projectU3(GF& u_proj, GF& u_mu) const { + + auto grid = this->_grid; + + LF V(grid), Q(grid), sqrtQinv(grid), id_3(grid), diff(grid); + CF c0(grid), c1(grid), c2(grid), g0(grid), g1(grid), g2(grid), S(grid), R(grid), theta(grid), + u(grid), v(grid), w(grid), den(grid), f0(grid), f1(grid), f2(grid); + + // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8) + for (int mu = 0; mu < Nd; mu++) { + V = PeekIndex(u_mu, mu); + Q = adj(V)*V; + c0 = real(trace(Q)); + c1 = (1/2.)*real(trace(Q*Q)); + c2 = (1/3.)*real(trace(Q*Q*Q)); + S = (1/3.)*c1-(1/18.)*c0*c0; + if (norm2(S)<1e-28) { + g0 = (1/3.)*c0; g1 = g0; g2 = g1; + } else { + R = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0; + theta = acos(R*pow(S,-1.5)); + g0 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.); + g1 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta ); + g2 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.); + } +// if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD } + u = sqrt(g0) + sqrt(g1) + sqrt(g2); + v = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2); + w = sqrt(g0*g1*g2); + den = w*(u*v-w); + f0 = (-w*(u*u+v)+u*v*v)/den; + f1 = (-w-u*u*u+2.*u*v)/den; + f2 = u/den; + id_3 = 1.; + + sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q; + + PokeIndex(u_proj, V*sqrtQinv, mu); + } + }; + + +// void derivative(const GaugeField& Gauge) const { +// }; +}; + + +NAMESPACE_END(Grid); \ No newline at end of file diff --git a/Grid/qcd/smearing/Smearing.h b/Grid/qcd/smearing/Smearing.h index da5ede72..41a305ae 100644 --- a/Grid/qcd/smearing/Smearing.h +++ b/Grid/qcd/smearing/Smearing.h @@ -5,4 +5,5 @@ #include #include #include +#include diff --git a/Grid/qcd/smearing/StoutSmearing.h b/Grid/qcd/smearing/StoutSmearing.h index 641331dc..787ef104 100644 --- a/Grid/qcd/smearing/StoutSmearing.h +++ b/Grid/qcd/smearing/StoutSmearing.h @@ -69,7 +69,7 @@ public: /*! Construct stout smearing object from explicitly specified rho matrix */ Smear_Stout(const std::vector& rho_) : OwnedBase{new Smear_APE(rho_)}, SmearBase{OwnedBase.get()} { - std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector& " << rho_ << " )" << std::endl + std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector& " << rho_ << " )" << std::endl; assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3"); } diff --git a/Grid/qcd/utils/GaugeGroup.h b/Grid/qcd/utils/GaugeGroup.h index f92064f4..6811d247 100644 --- a/Grid/qcd/utils/GaugeGroup.h +++ b/Grid/qcd/utils/GaugeGroup.h @@ -100,6 +100,9 @@ class GaugeGroup { using iGroupMatrix = iScalar > >; template using iAlgebraVector = iScalar > >; + template + using iSUnAlgebraMatrix = + iScalar > >; static int su2subgroups(void) { return su2subgroups(group_name()); } ////////////////////////////////////////////////////////////////////////////////////////////////// @@ -128,10 +131,19 @@ class GaugeGroup { typedef Lattice LatticeMatrix; typedef Lattice LatticeMatrixF; typedef Lattice LatticeMatrixD; - + typedef Lattice LatticeAlgebraVector; typedef Lattice LatticeAlgebraVectorF; typedef Lattice LatticeAlgebraVectorD; + + typedef iSUnAlgebraMatrix vAlgebraMatrix; + typedef iSUnAlgebraMatrix vAlgebraMatrixF; + typedef iSUnAlgebraMatrix vAlgebraMatrixD; + + typedef Lattice LatticeAlgebraMatrix; + typedef Lattice LatticeAlgebraMatrixF; + typedef Lattice LatticeAlgebraMatrixD; + typedef iSU2Matrix SU2Matrix; typedef iSU2Matrix SU2MatrixF; @@ -160,7 +172,7 @@ class GaugeGroup { return generator(lieIndex, ta, group_name()); } - static void su2SubGroupIndex(int &i1, int &i2, int su2_index) { + static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) { return su2SubGroupIndex(i1, i2, su2_index, group_name()); } @@ -389,6 +401,52 @@ class GaugeGroup { } } +// Ta are hermitian (?) +// Anti herm is i Ta basis +static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b) +{ + conformable(in, out); + GridBase *grid = out.Grid(); + LatticeComplex tmp(grid); + Matrix ta; + // Using Luchang's projection convention + // 2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a + autoView(out_v,out,AcceleratorWrite); + autoView(in_v,in,AcceleratorRead); + int N = ncolour; + int NNm1 = N * (N - 1); + int hNNm1= NNm1/2; + RealD sqrt_2 = sqrt(2.0); + Complex ci(0.0,1.0); + for(int su2Index=0;su2IndexoSites(),1,{ + // in is traceless ANTI-hermitian whereas Grid generators are Hermitian. + // trace( Ta x Ci in) + // Bet I need to move to real part with mult by -i + out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); + out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); + }); + } + for(int diagIndex=0;diagIndexoSites(),vComplex::Nsimd(),{ + auto tmp = in_v[ss]()()(0,0); + for(int i=1;i diff --git a/Grid/qcd/utils/SUn.impl.h b/Grid/qcd/utils/SUn.impl.h index e19f970c..02fa161b 100644 --- a/Grid/qcd/utils/SUn.impl.h +++ b/Grid/qcd/utils/SUn.impl.h @@ -10,6 +10,7 @@ // doesn't get found by the scripts/filelist during bootstrapping. private: + template static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; } //////////////////////////////////////////////////////////////////////// @@ -576,3 +577,4 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie LieRandomize(pRNG,g,1.0); GaugeTransform(Umu,g); } + diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index 665ab2d6..851ba172 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -464,7 +464,8 @@ public: //U_padded: the gauge link fields padded out using the PaddedCell class //Cell: the padded cell class //gStencil: the precomputed generalized local stencil for the staple - static void StaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) { + static void StaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) + { double t0 = usecond(); assert(U_padded.size() == Nd); assert(staple.size() == Nd); assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); @@ -489,7 +490,7 @@ public: autoView( gStaple_v , gStaple, AcceleratorWrite); auto gStencil_v = gStencil.View(AcceleratorRead); - accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), { decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; stencil_ss = Zero(); int off = outer_off; @@ -1201,7 +1202,7 @@ public: autoView( gStaple_v , gStaple, AcceleratorWrite); auto gStencil_v = gStencil.View(AcceleratorRead); - accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + accelerator_for(ss, ggrid->oSites(), (size_t)ggrid->Nsimd(), { decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; stencil_ss = Zero(); int s=offset; diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index 1c5ac785..976d5568 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -1141,4 +1141,13 @@ template void gpermute(vobj & inout,int perm){ NAMESPACE_END(Grid); +#ifdef GRID_SYCL +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +template<> struct sycl::is_device_copyable : public std::true_type {}; +#endif + + #endif diff --git a/Grid/sitmo_rng/sitmo_prng_engine.hpp b/Grid/sitmo_rng/sitmo_prng_engine.hpp index c76a0080..2db90e24 100644 --- a/Grid/sitmo_rng/sitmo_prng_engine.hpp +++ b/Grid/sitmo_rng/sitmo_prng_engine.hpp @@ -218,6 +218,10 @@ public: // ------------------------------------------------- // misc // ------------------------------------------------- + void discardhi(uint64_t z) { + _s[3] += z; + encrypt_counter(); + } // req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9 // Advances e’s state ei to ei+z by any means equivalent to z @@ -387,4 +391,4 @@ private: #undef MIXK #undef MIX2 -#endif \ No newline at end of file +#endif diff --git a/Grid/stencil/GeneralLocalStencil.h b/Grid/stencil/GeneralLocalStencil.h index bace6aca..b6848977 100644 --- a/Grid/stencil/GeneralLocalStencil.h +++ b/Grid/stencil/GeneralLocalStencil.h @@ -48,7 +48,7 @@ class GeneralLocalStencilView { int _npoints; // Move to template param? GeneralStencilEntry* _entries_p; - accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) { + accelerator_inline GeneralStencilEntry * GetEntry(int point,int osite) const { return & this->_entries_p[point+this->_npoints*osite]; } void ViewClose(void){}; @@ -148,5 +148,55 @@ public: }; + +//////////////////////////////////////////////// +// Some machinery to streamline making a stencil +//////////////////////////////////////////////// + +class shiftSignal { +public: + enum { + BACKWARD_CONST = 16, + NO_SHIFT = -1 + }; +}; + +// TODO: put a check somewhere that BACKWARD_CONST > Nd! + +/*! @brief signals that you want to go backwards in direction dir */ +inline int Back(const int dir) { + // generalShift will use BACKWARD_CONST to determine whether we step forward or + // backward. Trick inspired by SIMULATeQCD. + return dir + shiftSignal::BACKWARD_CONST; +} + +/*! @brief shift one unit in direction dir */ +template +void generalShift(Coordinate& shift, int dir) { + if (dir >= shiftSignal::BACKWARD_CONST) { + dir -= shiftSignal::BACKWARD_CONST; + shift[dir]+=-1; + } else if (dir == shiftSignal::NO_SHIFT) { + ; // do nothing + } else { + shift[dir]+=1; + } +} + +/*! @brief follow a path of directions, shifting one unit in each direction */ +template +void generalShift(Coordinate& shift, int dir, Args... args) { + if (dir >= shiftSignal::BACKWARD_CONST) { + dir -= shiftSignal::BACKWARD_CONST; + shift[dir]+=-1; + } else if (dir == shiftSignal::NO_SHIFT) { + ; // do nothing + } else { + shift[dir]+=1; + } + generalShift(shift, args...); +} + + NAMESPACE_END(Grid); diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 7eb741b1..ef3aa821 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -706,7 +706,7 @@ public: } } } - std::cout << GridLogDebug << "BuildSurfaceList size is "< &directions, const std::vector &distances, - Parameters p=Parameters()) + Parameters p=Parameters(), + bool preserve_shm=false) { face_table_computed=0; _grid = grid; @@ -855,7 +856,9 @@ public: ///////////////////////////////////////////////////////////////////////////////// const int Nsimd = grid->Nsimd(); - _grid->ShmBufferFreeAll(); + // Allow for multiple stencils to exist simultaneously + if (!preserve_shm) + _grid->ShmBufferFreeAll(); int maxl=2; u_simd_send_buf.resize(maxl); diff --git a/Grid/tensors/Tensor_trace.h b/Grid/tensors/Tensor_trace.h index 6aa398f9..c73d0949 100644 --- a/Grid/tensors/Tensor_trace.h +++ b/Grid/tensors/Tensor_trace.h @@ -69,6 +69,35 @@ accelerator_inline auto trace(const iVector &arg) -> iVector = 0, IfNotGridTensor = 0> +accelerator_inline auto traceProduct( const S1 &arg1,const S2 &arg2) + -> decltype(arg1*arg2) +{ + return arg1*arg2; +} + +template +accelerator_inline auto traceProduct(const iMatrix &arg1,const iMatrix &arg2) -> iScalar +{ + iScalar ret; + zeroit(ret._internal); + for(int i=0;i +accelerator_inline auto traceProduct(const iScalar &arg1,const iScalar &arg2) -> iScalar +{ + iScalar ret; + ret._internal=traceProduct(arg1._internal,arg2._internal); + return ret; +} NAMESPACE_END(Grid); diff --git a/Grid/tensors/Tensor_traits.h b/Grid/tensors/Tensor_traits.h index 58fdc6ce..536e17f1 100644 --- a/Grid/tensors/Tensor_traits.h +++ b/Grid/tensors/Tensor_traits.h @@ -34,9 +34,12 @@ NAMESPACE_BEGIN(Grid); // These are the Grid tensors template struct isGridTensor : public std::false_type { static constexpr bool notvalue = true; }; - template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; - template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; - template struct isGridTensor> : public std::true_type { static constexpr bool notvalue = false; }; + template struct isGridTensor > : public std::true_type { static constexpr bool notvalue = false; }; + template struct isGridTensor >: public std::true_type { static constexpr bool notvalue = false; }; + template struct isGridTensor >: public std::true_type { static constexpr bool notvalue = false; }; + + template using IfGridTensor = Invoke::value, int> >; + template using IfNotGridTensor = Invoke::value, int> >; // Traits to identify scalars template struct isGridScalar : public std::false_type { static constexpr bool notvalue = true; }; @@ -401,3 +404,12 @@ NAMESPACE_BEGIN(Grid); }; NAMESPACE_END(Grid); + +#ifdef GRID_SYCL +template struct +sycl::is_device_copyable::value && (!std::is_trivially_copyable::value), + void>::type> + : public std::true_type {}; +#endif + diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index be6efe93..0d9a8874 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -7,6 +7,8 @@ uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; +#define ENV_LOCAL_RANK_PALS "PALS_LOCAL_RANKID" +#define ENV_RANK_PALS "PALS_RANKID" #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK" #define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID" @@ -228,8 +230,17 @@ void acceleratorInit(void) { rank = atoi(localRankStr); } + if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL) + { + rank = atoi(localRankStr); + } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} + if ((localRankStr = getenv(ENV_RANK_PALS )) != NULL) { world_rank = atoi(localRankStr);} + + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); auto devices = cl::sycl::device::get_devices(); for(int d = 0;d()); #define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld"); + if ( world_rank == 0) { - GPU_PROP_STR(vendor); - GPU_PROP_STR(version); + GPU_PROP_STR(vendor); + GPU_PROP_STR(version); // GPU_PROP_STR(device_type); /* GPU_PROP(max_compute_units); @@ -259,7 +271,8 @@ void acceleratorInit(void) GPU_PROP(single_fp_config); */ // GPU_PROP(double_fp_config); - GPU_PROP(global_mem_size); + GPU_PROP(global_mem_size); + } } if ( world_rank == 0 ) { diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 66ceb269..48062b56 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -268,6 +268,8 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} +inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch { @@ -297,17 +299,13 @@ inline int acceleratorIsCommunicable(void *ptr) #define GRID_SYCL_LEVEL_ZERO_IPC NAMESPACE_END(Grid); -#if 0 -#include -#include -#include -#include -#else + +// Force deterministic reductions +#define SYCL_REDUCTION_DETERMINISTIC #include #include #include #include -#endif NAMESPACE_BEGIN(Grid); @@ -336,23 +334,24 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \ - unsigned long nt=acceleratorThreads(); \ - unsigned long unum1 = num1; \ - unsigned long unum2 = num2; \ - if(nt < 8)nt=8; \ - cl::sycl::range<3> local {nt,1,nsimd}; \ - cl::sycl::range<3> global{unum1,unum2,nsimd}; \ - cgh.parallel_for( \ - cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ - [[intel::reqd_sub_group_size(16)]] \ - { \ - auto iter1 = item.get_global_id(0); \ - auto iter2 = item.get_global_id(1); \ - auto lane = item.get_global_id(2); \ - { __VA_ARGS__ }; \ - }); \ - }); + unsigned long nt=acceleratorThreads(); \ + if(nt < 8)nt=8; \ + unsigned long unum1 = num1; \ + unsigned long unum2 = num2; \ + unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \ + cl::sycl::range<3> local {nt,1,nsimd}; \ + cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \ + cgh.parallel_for( \ + cl::sycl::nd_range<3>(global,local), \ + [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + [[intel::reqd_sub_group_size(16)]] \ + { \ + auto iter1 = item.get_global_id(0); \ + auto iter2 = item.get_global_id(1); \ + auto lane = item.get_global_id(2); \ + { if (iter1 < unum1){ __VA_ARGS__ } }; \ + }); \ + }); #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } @@ -508,6 +507,12 @@ inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes { auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream); } +inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { + auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream); +} +inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { + auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream); +} inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); }; #endif @@ -664,4 +669,5 @@ template T acceleratorGet(T& dev) + NAMESPACE_END(Grid); diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index c70ad38a..f4fb776d 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -394,6 +394,9 @@ void Grid_init(int *argc,char ***argv) std::cout << GridLogMessage << "MPI is initialised and logging filters activated "< HMCWrapper; MD.name = std::string("MinimumNorm2"); - MD.MDsteps = 12; + MD.MDsteps = 24; MD.trajL = 1.0; HMCparameters HMCparams; - HMCparams.StartTrajectory = 0; + HMCparams.StartTrajectory = 104; HMCparams.Trajectories = 200; HMCparams.NoMetropolisUntil= 20; // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; - HMCparams.StartingType =std::string("HotStart"); + // HMCparams.StartingType =std::string("HotStart"); + HMCparams.StartingType =std::string("CheckpointStart"); HMCparams.MD = MD; HMCWrapper TheHMC(HMCparams); @@ -87,6 +88,7 @@ int main(int argc, char **argv) // here there is too much indirection typedef PlaquetteMod PlaqObs; TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// const int Ls = 16; @@ -134,7 +136,6 @@ int main(int argc, char **argv) //////////////////////////////////// ActionLevel Level1(1); ActionLevel Level2(2); - ActionLevel Level3(4); //////////////////////////////////// // Strange action @@ -191,7 +192,7 @@ int main(int argc, char **argv) Smear_Stout Stout(rho); SmearedConfigurationMasked SmearingPolicy(GridPtr, Nstep, Stout); JacobianAction Jacobian(&SmearingPolicy); - if( ApplySmearing ) Level2.push_back(&Jacobian); + if( ApplySmearing ) Level1.push_back(&Jacobian); std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; @@ -200,7 +201,7 @@ int main(int argc, char **argv) ///////////////////////////////////////////////////////////// // GaugeAction.is_smeared = ApplySmearing; GaugeAction.is_smeared = true; - Level3.push_back(&GaugeAction); + Level2.push_back(&GaugeAction); std::cout << GridLogMessage << " ************************************************"<< std::endl; std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; @@ -210,10 +211,11 @@ int main(int argc, char **argv) std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; - TheHMC.TheAction.push_back(Level1); TheHMC.TheAction.push_back(Level2); - TheHMC.TheAction.push_back(Level3); + + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); TheHMC.Run(SmearingPolicy); // for smearing diff --git a/HMC/FTHMC2p1f_3GeV.cc b/HMC/FTHMC2p1f_3GeV.cc new file mode 100644 index 00000000..a8aa67f8 --- /dev/null +++ b/HMC/FTHMC2p1f_3GeV.cc @@ -0,0 +1,226 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Copyright (C) 2023 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +using namespace Grid; + +int main(int argc, char **argv) +{ + std::cout << std::setprecision(12); + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + // here make a routine to print all the relevant information on the run + std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; + + // Typedefs to simplify notation + typedef WilsonImplR FermionImplPolicy; + typedef MobiusFermionD FermionAction; + typedef typename FermionAction::FermionField FermionField; + + typedef Grid::XmlReader Serialiser; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 24; + MD.trajL = 1.0; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 0; + HMCparams.Trajectories = 200; + HMCparams.NoMetropolisUntil= 20; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("HotStart"); + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_EODWF_lat"; + CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr"; + CPparams.rng_prefix = "ckpoint_EODWF_rng"; + CPparams.saveInterval = 1; + CPparams.saveSmeared = true; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + + ////////////////////////////////////////////// + + const int Ls = 12; + Real beta = 2.37; + Real light_mass = 0.0047; + Real strange_mass = 0.0186; + Real pv_mass = 1.0; + RealD M5 = 1.8; + RealD b = 1.0; // Scale factor one, Shamir + RealD c = 0.0; + + OneFlavourRationalParams OFRp; + OFRp.lo = 1.0e-2; + OFRp.hi = 64; + OFRp.MaxIter = 10000; + OFRp.tolerance= 1.0e-10; + OFRp.degree = 14; + OFRp.precision= 40; + + std::vector hasenbusch({ 0.05, 0.1, 0.25, 0.5 }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeField Uhot(GridPtr); + + // These lines are unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + + double StoppingCondition = 1e-10; + double MaxCGIterations = 30000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + + bool ApplySmearing = true; + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + CG, + CG, CG, + CG, CG, + OFRp, false); + + EOFA.is_smeared = ApplySmearing; + Level1.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + for(int h=0;h(*Numerators[h],*Denominators[h],CG,CG)); + } + + for(int h=0;his_smeared = ApplySmearing; + Level1.push_back(Quotients[h]); + } + + ///////////////////////////////////////////////////////////// + // lnDetJacobianAction + ///////////////////////////////////////////////////////////// + double rho = 0.1; // smearing parameter + int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd + int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd + Smear_Stout Stout(rho); + SmearedConfigurationMasked SmearingPolicy(GridPtr, Nstep, Stout); + JacobianAction Jacobian(&SmearingPolicy); + if( ApplySmearing ) Level1.push_back(&Jacobian); + std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; + + + ///////////////////////////////////////////////////////////// + // Gauge action + ///////////////////////////////////////////////////////////// + GaugeAction.is_smeared = ApplySmearing; + Level2.push_back(&GaugeAction); + + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << std::endl; + + + std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; + TheHMC.TheAction.push_back(Level1); + TheHMC.TheAction.push_back(Level2); + + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + + TheHMC.Run(SmearingPolicy); // for smearing + + Grid_finalize(); +} // main + + + diff --git a/HMC/HMC2p1f_3GeV.cc b/HMC/HMC2p1f_3GeV.cc new file mode 100644 index 00000000..4bf088d7 --- /dev/null +++ b/HMC/HMC2p1f_3GeV.cc @@ -0,0 +1,226 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Copyright (C) 2023 + +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include +#include +#include + +using namespace Grid; + +int main(int argc, char **argv) +{ + std::cout << std::setprecision(12); + + Grid_init(&argc, &argv); + int threads = GridThread::GetThreads(); + // here make a routine to print all the relevant information on the run + std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; + + // Typedefs to simplify notation + typedef WilsonImplR FermionImplPolicy; + typedef MobiusFermionD FermionAction; + typedef typename FermionAction::FermionField FermionField; + + typedef Grid::XmlReader Serialiser; + + //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + IntegratorParameters MD; + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Leap Frog"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("Force Gradient"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("MinimumNorm2"); + MD.MDsteps = 24; + MD.trajL = 1.0; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 0; + HMCparams.Trajectories = 200; + HMCparams.NoMetropolisUntil= 20; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("HotStart"); + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_EODWF_lat"; + CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr"; + CPparams.rng_prefix = "ckpoint_EODWF_rng"; + CPparams.saveInterval = 1; + CPparams.saveSmeared = true; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + + RNGModuleParameters RNGpar; + RNGpar.serial_seeds = "1 2 3 4 5"; + RNGpar.parallel_seeds = "6 7 8 9 10"; + TheHMC.Resources.SetRNGSeeds(RNGpar); + + // Construct observables + // here there is too much indirection + typedef PlaquetteMod PlaqObs; + TheHMC.Resources.AddObservable(); + + ////////////////////////////////////////////// + + const int Ls = 12; + Real beta = 2.37; + Real light_mass = 0.0047; + Real strange_mass = 0.0186; + Real pv_mass = 1.0; + RealD M5 = 1.8; + RealD b = 1.0; // Scale factor one, Shamir + RealD c = 0.0; + + OneFlavourRationalParams OFRp; + OFRp.lo = 1.0e-2; + OFRp.hi = 64; + OFRp.MaxIter = 10000; + OFRp.tolerance= 1.0e-10; + OFRp.degree = 14; + OFRp.precision= 40; + + std::vector hasenbusch({ 0.05, 0.1, 0.25, 0.5 }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeField U(GridPtr); + LatticeGaugeField Uhot(GridPtr); + + // These lines are unecessary if BC are all periodic + std::vector boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + + double StoppingCondition = 1e-10; + double MaxCGIterations = 30000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + + bool ApplySmearing = false; + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + CG, + CG, CG, + CG, CG, + OFRp, false); + + EOFA.is_smeared = ApplySmearing; + Level1.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + for(int h=0;h(*Numerators[h],*Denominators[h],CG,CG)); + } + + for(int h=0;his_smeared = ApplySmearing; + Level1.push_back(Quotients[h]); + } + + ///////////////////////////////////////////////////////////// + // lnDetJacobianAction + ///////////////////////////////////////////////////////////// + double rho = 0.1; // smearing parameter + int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd + int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd + Smear_Stout Stout(rho); + SmearedConfigurationMasked SmearingPolicy(GridPtr, Nstep, Stout); + JacobianAction Jacobian(&SmearingPolicy); + if( ApplySmearing ) Level1.push_back(&Jacobian); + std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; + + + ///////////////////////////////////////////////////////////// + // Gauge action + ///////////////////////////////////////////////////////////// + GaugeAction.is_smeared = ApplySmearing; + Level2.push_back(&GaugeAction); + + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; + std::cout << GridLogMessage << " ************************************************"<< std::endl; + std::cout << GridLogMessage << std::endl; + std::cout << GridLogMessage << std::endl; + + + std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; + TheHMC.TheAction.push_back(Level1); + TheHMC.TheAction.push_back(Level2); + + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + + TheHMC.Run(SmearingPolicy); // for smearing + + Grid_finalize(); +} // main + + + diff --git a/HMC/Mobius2p1f_DD_EOFA_96I_double.cc b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc new file mode 100644 index 00000000..a2af38f7 --- /dev/null +++ b/HMC/Mobius2p1f_DD_EOFA_96I_double.cc @@ -0,0 +1,350 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/Test_hmc_EODWFRatio.cc + +Copyright (C) 2015-2016 + +Author: Peter Boyle +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +int main(int argc, char **argv) { + using namespace Grid; + + Grid_init(&argc, &argv); + + CartesianCommunicator::BarrierWorld(); + std::cout << GridLogMessage << " Clock skew check" < HMCWrapper; + // MD.name = std::string("Leap Frog"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("Force Gradient"); + //typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("MinimumNorm2"); + // TrajL = 2 + // 4/2 => 0.6 dH + // 3/3 => 0.8 dH .. depth 3, slower + //MD.MDsteps = 4; + MD.MDsteps = 3; + MD.trajL = 0.5; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 1; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + // HMCparams.StartingType =std::string("ColdStart"); + HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_DDHMC_lat"; + CPparams.rng_prefix = "ckpoint_DDHMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + std::cout << "loaded NERSC checpointer"< PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + Real beta = 2.13; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + Real light_mass_dir = 0.01; + Real strange_mass = 0.0362; + Real pv_mass = 1.0; + std::vector hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); + // std::vector hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated + // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); + + int SP_iters=9000; + + RationalActionParams OFRp; // Up/down + OFRp.lo = 6.0e-5; + OFRp.hi = 90.0; + OFRp.inv_pow = 2; + OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space + OFRp.action_tolerance= 1.0e-8; + OFRp.action_degree = 18; + OFRp.md_tolerance= 1.0e-7; + OFRp.md_degree = 14; + // OFRp.degree = 20; converges + // OFRp.degree = 16; + OFRp.precision= 80; + OFRp.BoundsCheckFreq=0; + std::vector ActionTolByPole({ + // 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 3.0e-7,1.0e-7,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 + }); + std::vector MDTolByPole({ + // 1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more + // 1.0e-6,3.0e-7,1.0e-7,1.0e-7, + 1.0e-5,1.0e-6,1.0e-7,1.0e-7, // soften convergence + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8,1.0e-8,1.0e-8, + 1.0e-8,1.0e-8 + }); + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + Coordinate CommDim(Nd); + for(int d=0;d1 ? 1 : 0; + + Coordinate NonDirichlet(Nd+1,0); + Coordinate Dirichlet(Nd+1,0); + Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + //Dirichlet[1] = 0; + //Dirichlet[2] = 0; + //Dirichlet[3] = 0; + + // + Coordinate Block4(Nd); + Block4[0] = Dirichlet[1]; + Block4[1] = Dirichlet[2]; + Block4[2] = Dirichlet[3]; + Block4[3] = Dirichlet[4]; + + int Width=4; + TheHMC.Resources.SetMomentumFilter(new DDHMCFilter(Block4,Width)); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeFieldD U(GridPtr); U=Zero(); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + std::cout << "loaded NERSC gauge field"< boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + FermionAction::ImplParams ParamsDir(boundary); + + Params.dirichlet=NonDirichlet; + ParamsDir.dirichlet=Dirichlet; + ParamsDir.partialDirichlet=0; + std::cout << GridLogMessage<< "Partial Dirichlet depth is "< CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(3); + ActionLevel Level3(15); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + // Probably dominates the force - back to EOFA. + OneFlavourRationalParams SFRp; + SFRp.lo = 0.1; + SFRp.hi = 25.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-8; + SFRp.mdtolerance= 2.0e-6; + SFRp.degree = 12; + SFRp.precision= 50; + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ConjugateGradient ActionCG(StoppingCondition,MaxCGIterations); + ConjugateGradient DerivativeCG(MDStoppingCondition,MaxCGIterations); + LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); + LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); + + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + ActionCG, + ActionCG, ActionCG, + DerivativeCG, DerivativeCG, + SFRp, true); + Level2.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + std::vector dirichlet_den; + std::vector dirichlet_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); dirichlet_den.push_back(0); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + std::vector *> Bdys; + + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG)); + } else { + Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction(*Numerators[h],*Denominators[h],OFRp)); + } + } + for(int h=0;hSetTolerances(ActionTolByPole,MDTolByPole); + } + int nquo=Quotients.size(); + Level1.push_back(Bdys[0]); + Level1.push_back(Bdys[1]); + Level2.push_back(Quotients[0]); + for(int h=1;h MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); @@ -180,7 +180,7 @@ int main(int argc, char **argv) { // 4/2 => 0.6 dH // 3/3 => 0.8 dH .. depth 3, slower //MD.MDsteps = 4; - MD.MDsteps = 14; + MD.MDsteps = 12; MD.trajL = 0.5; HMCparameters HMCparams; @@ -204,7 +204,7 @@ int main(int argc, char **argv) { TheHMC.Resources.LoadNerscCheckpointer(CPparams); std::cout << "loaded NERSC checpointer"< hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); - // std::vector hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); - std::vector hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated - // std::vector hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); + std::vector hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated + //std::vector hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated auto GridPtr = TheHMC.Resources.GetCartesian(); auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); @@ -277,20 +276,20 @@ int main(int argc, char **argv) { // double StoppingCondition = 1e-14; // double MDStoppingCondition = 1e-9; - double StoppingCondition = 1e-9; - double MDStoppingCondition = 1e-8; - double MDStoppingConditionLoose = 1e-8; - double MDStoppingConditionStrange = 1e-8; - double MaxCGIterations = 300000; + double StoppingCondition = 1e-14; + double MDStoppingCondition = 1e-9; + double MDStoppingConditionLoose = 1e-9; + double MDStoppingConditionStrange = 1e-9; + double MaxCGIterations = 50000; ConjugateGradient CG(StoppingCondition,MaxCGIterations); ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); //////////////////////////////////// // Collect actions //////////////////////////////////// - // ActionLevel Level1(1); - ActionLevel Level2(1); - ActionLevel Level3(15); + ActionLevel Level1(1); + ActionLevel Level2(2); + ActionLevel Level3(4); //////////////////////////////////// // Strange action @@ -300,11 +299,11 @@ int main(int argc, char **argv) { // Probably dominates the force - back to EOFA. OneFlavourRationalParams SFRp; - SFRp.lo = 0.1; + SFRp.lo = 0.8; SFRp.hi = 30.0; SFRp.MaxIter = 10000; - SFRp.tolerance= 1.0e-8; - SFRp.mdtolerance= 2.0e-6; + SFRp.tolerance= 1.0e-12; + SFRp.mdtolerance= 1.0e-9; SFRp.degree = 10; SFRp.precision= 50; @@ -355,8 +354,10 @@ int main(int argc, char **argv) { ExactOneFlavourRatioPseudoFermionAction EOFA(Strange_Op_L, Strange_Op_R, ActionCG, - ActionCGL, ActionCGR, - DerivativeCGL, DerivativeCGR, + // ActionCGL, ActionCGR, + // DerivativeCGL, DerivativeCGR, + ActionCG, ActionCG, + DerivativeCG, DerivativeCG, SFRp, true); Level2.push_back(&EOFA); @@ -443,13 +444,14 @@ int main(int argc, char **argv) { } int nquo=Quotients.size(); for(int h=0;h +Author: Guido Cossu + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + + + +int main(int argc, char **argv) { + using namespace Grid; + + std::cout << " Grid Initialise "< HMCWrapper; + // MD.name = std::string("Leap Frog"); + typedef GenericHMCRunner HMCWrapper; + MD.name = std::string("Force Gradient"); + // typedef GenericHMCRunner HMCWrapper; + // MD.name = std::string("MinimumNorm2"); + // TrajL = 2 + // 4/2 => 0.6 dH + // 3/3 => 0.8 dH .. depth 3, slower + //MD.MDsteps = 4; + MD.MDsteps = 8; + MD.trajL = 0.5; + + HMCparameters HMCparams; + HMCparams.StartTrajectory = 1077; + HMCparams.Trajectories = 20; + HMCparams.NoMetropolisUntil= 0; + // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.MD = MD; + HMCWrapper TheHMC(HMCparams); + + // Grid from the command line arguments --grid and --mpi + TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition + + CheckpointerParameters CPparams; + CPparams.config_prefix = "ckpoint_HMC_lat"; + CPparams.rng_prefix = "ckpoint_HMC_rng"; + CPparams.saveInterval = 1; + CPparams.format = "IEEE64BIG"; + TheHMC.Resources.LoadNerscCheckpointer(CPparams); + std::cout << "loaded NERSC checpointer"< PlaqObs; + TheHMC.Resources.AddObservable(); + ////////////////////////////////////////////// + + const int Ls = 12; + RealD M5 = 1.8; + RealD b = 1.5; + RealD c = 0.5; + RealD beta = 2.13; + // Real light_mass = 5.4e-4; + Real light_mass = 7.8e-4; + // Real light_mass = 7.8e-3; + Real strange_mass = 0.0362; + Real pv_mass = 1.0; + std::vector hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated + //std::vector hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated + + auto GridPtr = TheHMC.Resources.GetCartesian(); + auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); + + typedef SchurDiagMooeeOperator LinearOperatorD; + typedef SchurDiagMooeeOperator LinearOperatorEOFAD; + + //////////////////////////////////////////////////////////////// + // Domain decomposed + //////////////////////////////////////////////////////////////// + Coordinate latt4 = GridPtr->GlobalDimensions(); + Coordinate mpi = GridPtr->ProcessorGrid(); + Coordinate shm; + + GlobalSharedMemory::GetShmDims(mpi,shm); + + ////////////////////////// + // Fermion Grids + ////////////////////////// + auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); + auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); + + IwasakiGaugeActionR GaugeAction(beta); + + // temporarily need a gauge field + LatticeGaugeFieldD U(GridPtr); U=Zero(); + + std::cout << GridLogMessage << " Running the HMC "<< std::endl; + TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file + TheHMC.initializeGaugeFieldAndRNGs(U); + std::cout << "loaded NERSC gauge field"< boundary = {1,1,1,-1}; + FermionAction::ImplParams Params(boundary); + + // double StoppingCondition = 1e-14; + // double MDStoppingCondition = 1e-9; + double StoppingCondition = 1e-14; + double MDStoppingCondition = 1e-9; + double MDStoppingConditionLoose = 1e-9; + double MDStoppingConditionStrange = 1e-9; + double MaxCGIterations = 50000; + ConjugateGradient CG(StoppingCondition,MaxCGIterations); + ConjugateGradient MDCG(MDStoppingCondition,MaxCGIterations); + + //////////////////////////////////// + // Collect actions + //////////////////////////////////// + ActionLevel Level1(1); + ActionLevel Level2(2); + ActionLevel Level3(4); + + //////////////////////////////////// + // Strange action + //////////////////////////////////// + FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); + FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass, M5,b,c, Params); + + // Probably dominates the force - back to EOFA. + OneFlavourRationalParams SFRp; + SFRp.lo = 0.8; + SFRp.hi = 30.0; + SFRp.MaxIter = 10000; + SFRp.tolerance= 1.0e-12; + SFRp.mdtolerance= 1.0e-9; + SFRp.degree = 10; + SFRp.precision= 50; + + MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); + MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c); + ConjugateGradient ActionCG(StoppingCondition,MaxCGIterations); + ConjugateGradient DerivativeCG(MDStoppingCondition,MaxCGIterations); + LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); + LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); + + ExactOneFlavourRatioPseudoFermionAction + EOFA(Strange_Op_L, Strange_Op_R, + ActionCG, + ActionCG, ActionCG, + DerivativeCG, DerivativeCG, + SFRp, true); + Level2.push_back(&EOFA); + + //////////////////////////////////// + // up down action + //////////////////////////////////// + std::vector light_den; + std::vector light_num; + + int n_hasenbusch = hasenbusch.size(); + light_den.push_back(light_mass); + for(int h=0;h Numerators; + std::vector Denominators; + std::vector *> Quotients; + + std::vector *> Bdys; + + typedef SchurDiagMooeeOperator LinearOperatorD; + std::vector LinOpD; + + for(int h=0;h(*Numerators[h],*Denominators[h],MDCG,CG,CG)); + } + int nquo=Quotients.size(); + for(int h=0;h +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/************************************************************** + * GPU - GPU memory cartesian halo exchange benchmark + * Config: what is the target + ************************************************************** + */ +#undef ACC_CUDA +#undef ACC_HIP +#define ACC_SYCL +#undef ACC_NONE + +/************************************************************** + * Some MPI globals + ************************************************************** + */ +MPI_Comm WorldComm; +MPI_Comm WorldShmComm; + +int WorldSize; +int WorldRank; + +int WorldShmSize; +int WorldShmRank; + +/************************************************************** + * Allocate buffers on the GPU, SYCL needs an init call and context + ************************************************************** + */ +#ifdef ACC_CUDA +#include +void acceleratorInit(void){} +void *acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + assert(err==cudaSuccess); + return ptr; +} +void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);} +#endif +#ifdef ACC_HIP +#include +void acceleratorInit(void){} +inline void *acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = hipMalloc((void **)&ptr,bytes); + if( err != hipSuccess ) { + ptr = (void *) NULL; + printf(" hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); + } + return ptr; +}; +inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);}; +#endif +#ifdef ACC_SYCL +#include +#include +cl::sycl::queue *theAccelerator; +void acceleratorInit(void) +{ + int nDevices = 1; +#if 1 + cl::sycl::gpu_selector selector; + cl::sycl::device selectedDevice { selector }; + theAccelerator = new sycl::queue (selectedDevice); +#else + cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v }; + theAccelerator = new sycl::queue (selectedDevice); +#endif + auto name = theAccelerator->get_device().get_info(); + printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout); +} +inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theAccelerator);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr,*theAccelerator);}; +#endif +#ifdef ACC_NONE +void acceleratorInit(void){} +inline void *acceleratorAllocDevice(size_t bytes){ return malloc(bytes);}; +inline void acceleratorFreeDevice(void *ptr){free(ptr);}; +#endif + + +/************************************************************** + * Microsecond timer + ************************************************************** + */ +inline double usecond(void) { + struct timeval tv; + gettimeofday(&tv,NULL); + return 1.0e6*tv.tv_sec + 1.0*tv.tv_usec; +} +/************************************************************** + * Main benchmark routine + ************************************************************** + */ +void Benchmark(int64_t L,std::vector cart_geom,bool use_device,int ncall) +{ + int64_t words = 3*4*2; + int64_t face,vol; + int Nd=cart_geom.size(); + + /************************************************************** + * L^Nd volume, L^(Nd-1) faces, 12 complex per site + * Allocate memory for these + ************************************************************** + */ + face=1; for( int d=0;d send_bufs; + std::vector recv_bufs; + size_t vw = face*words; + size_t bytes = face*words*sizeof(double); + + if ( use_device ) { + for(int d=0;d<2*Nd;d++){ + send_bufs.push_back(acceleratorAllocDevice(bytes)); + recv_bufs.push_back(acceleratorAllocDevice(bytes)); + } + } else { + for(int d=0;d<2*Nd;d++){ + send_bufs.push_back(malloc(bytes)); + recv_bufs.push_back(malloc(bytes)); + } + } + /********************************************************* + * Build cartesian communicator + ********************************************************* + */ + int ierr; + int rank; + std::vector coor(Nd); + MPI_Comm communicator; + std::vector periodic(Nd,1); + MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator); + MPI_Comm_rank(communicator,&rank); + MPI_Cart_coords(communicator,rank,Nd,&coor[0]); + + static int reported; + if ( ! reported ) { + printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank, + coor[0],coor[1],coor[2],coor[3]); fflush(stdout); + reported =1 ; + } + /********************************************************* + * Perform halo exchanges + ********************************************************* + */ + for(int d=0;d1 ) { + double t0=usecond(); + + int from,to; + + MPI_Barrier(communicator); + for(int n=0;n & vec) +{ + vec.resize(0); + std::stringstream ss(str); + int i; + while (ss >> i){ + vec.push_back(i); + if(std::ispunct(ss.peek())) + ss.ignore(); + } + return; +} +/************************************** + * Command line junk + **************************************/ +int main(int argc, char **argv) +{ + std::string arg; + + acceleratorInit(); + + MPI_Init(&argc,&argv); + + WorldComm = MPI_COMM_WORLD; + + MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); + + MPI_Comm_rank(WorldComm ,&WorldRank); + MPI_Comm_size(WorldComm ,&WorldSize); + + MPI_Comm_rank(WorldShmComm ,&WorldShmRank); + MPI_Comm_size(WorldShmComm ,&WorldShmSize); + + if ( WorldSize/WorldShmSize > 2) { + printf("This benchmark is meant to run on at most two nodes only\n"); + } + + auto mpi =std::vector({1,1,1,1}); + + if( CmdOptionExists(argv,argv+argc,"--mpi") ){ + arg = CmdOptionPayload(argv,argv+argc,"--mpi"); + CmdOptionIntVector(arg,mpi); + } else { + printf("Must specify --mpi command line argument\n"); + exit(0); + } + + if( !WorldRank ) { + printf("***********************************\n"); + printf("%d ranks\n",WorldSize); + printf("%d ranks-per-node\n",WorldShmSize); + printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout); + printf("Cartesian layout: "); + for(int d=0;d1 ? 1 : 0; - Dirichlet[0] = 0; - Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; - Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; - Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; - Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; + // Dirichlet[0] = 0; + // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; + // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; + // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; + // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; Benchmark(Ls,Dirichlet); @@ -105,11 +105,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <1 ? 1 : 0; - Dirichlet[0] = 0; - Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; - Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; - Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; - Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; + // Dirichlet[0] = 0; + // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; + // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; + // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; + // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; Benchmark(Ls,Dirichlet); @@ -185,6 +185,7 @@ void Benchmark(int Ls, Coordinate Dirichlet) GaugeField Umu(UGrid); GaugeField UmuCopy(UGrid); SU::HotConfiguration(RNG4,Umu); + // SU::ColdConfiguration(Umu); UmuCopy=Umu; std::cout << GridLogMessage << "Random gauge initialised " << std::endl; @@ -307,6 +308,14 @@ void Benchmark(int Ls, Coordinate Dirichlet) if(( n2e>1.0e-4) ) { std::cout<Barrier(); + std::cout<Barrier(); exit(-1); } assert (n2e< 1.0e-4 ); diff --git a/benchmarks/Benchmark_usqcd.cc b/benchmarks/Benchmark_usqcd.cc new file mode 100644 index 00000000..3b729b9e --- /dev/null +++ b/benchmarks/Benchmark_usqcd.cc @@ -0,0 +1,968 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_usqcd.cc + + Copyright (C) 2015 + +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace Grid; + +std::vector L_list; +std::vector Ls_list; +std::vector mflop_list; + +double mflop_ref; +double mflop_ref_err; + +int NN_global; + +FILE * FP; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void comms_header(){ + std::cout <1) nmu++; + + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< xbuf(8); + std::vector rbuf(8); + //Grid.ShmBufferFreeAll(); + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + } + + // int ncomm; + double dbytes; + + for(int dir=0;dir<8;dir++) { + int mu =dir % 4; + if (mpi_layout[mu]>1 ) { + + std::vector times(Nloop); + for(int i=0;i > LatticeVec; + typedef iVector Vec; + + Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + fprintf(FP,"Memory Bandwidth\n\n"); + fprintf(FP,"Bytes, GB/s per node\n"); + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + // NP= Grid.RankCount(); + NN =Grid.NodeCount(); + + Vec rn ; random(sRNG,rn); + + LatticeVec z(&Grid); z=Zero(); + LatticeVec x(&Grid); x=Zero(); + LatticeVec y(&Grid); y=Zero(); + double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;i > LatticeSU4; + + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + NN =Grid.NodeCount(); + + + LatticeSU4 z(&Grid); z=Zero(); + LatticeSU4 x(&Grid); x=Zero(); + LatticeSU4 y(&Grid); y=Zero(); + // double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;i mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + typedef DomainWallFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + ///////// Source preparation //////////// + Gauge Umu(UGrid); SU::HotConfiguration(RNG4,Umu); + Fermion src (FGrid); random(RNG5,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + +#ifdef AVX512 + const int num_cases = 3; +#else + const int num_cases = 2; +#endif + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD c1=9.0/8.0; + RealD c2=-1.0/24.0; + RealD u0=1.0; + + typedef ImprovedStaggeredFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); + + typename Action::ImplParams params; + Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion src_e (FrbGrid); + Fermion src_o (FrbGrid); + Fermion r_e (FrbGrid); + Fermion r_o (FrbGrid); + Fermion r_eo (FGrid); + + { + + pickCheckerboard(Even,src_e,src); + pickCheckerboard(Odd,src_o,src); + + const int num_cases = 2; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + { StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent } + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops mflops_all; + + /////////////////////////////////////////////////////// + // Set/Get the layout & grid size + /////////////////////////////////////////////////////// + int threads = GridThread::GetThreads(); + Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); + Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); + + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, + GridDefaultSimd(Nd,vComplex::Nsimd()), + GridDefaultMpi()); + uint64_t NP = TmpGrid->RankCount(); + uint64_t NN = TmpGrid->NodeCount(); + NN_global=NN; + uint64_t SHM=NP/NN; + + + ///////// Welcome message //////////// + std::cout< seeds4({1,2,3,4}); + GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + RealD mass=0.1; + RealD csw=1.0; + + typedef WilsonCloverFermionF Action; + typedef typename Action::FermionField Fermion; + typedef LatticeGaugeFieldF Gauge; + + Gauge Umu(FGrid); SU::HotConfiguration(RNG4,Umu); + + Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw); + + ///////// Source preparation //////////// + Fermion src (FGrid); random(RNG4,src); + Fermion r (FGrid); + + { + + const int num_cases = 1; + std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); + + controls Cases [] = { + { WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }, + }; + + for(int c=0;cBarrier(); + for(int i=0;iBarrier(); + double t1=usecond(); + uint64_t ncall = 500; + + FGrid->Broadcast(0,&ncall,sizeof(ncall)); + + // std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"< t_time(ncall); + for(uint64_t i=0;iBarrier(); + + double volume=1; for(int mu=0;mumflops_best ) mflops_best = mflops; + if ( mflops({2,2,2,2}); + + Benchmark::Decomposition(); + + int do_su4=0; + int do_memory=1; + int do_comms =1; + int do_blas =1; + + int sel=4; + std::vector L_list({8,12,16,24,32}); + int selm1=sel-1; + + std::vector clover; + std::vector dwf4; + std::vector staggered; + + int Ls=1; + std::cout<&2 fi -./scripts/update_eigen.sh ${ARC} -rm ${ARC} -# patch for non-portable includes in Eigen 3.3.5 -# apparently already fixed in Eigen HEAD so it should not be -# a problem in the future (A.P.) -patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch - +./scripts/update_eigen.sh "${ARC}" +rm "${ARC}" echo '-- generating Make.inc files...' ./scripts/filelist echo '-- generating configure script...' diff --git a/examples/Example_plaquette.cc b/examples/Example_plaquette.cc new file mode 100644 index 00000000..faf17d82 --- /dev/null +++ b/examples/Example_plaquette.cc @@ -0,0 +1,183 @@ +/* + * Example_plaquette.cc + * + * D. Clarke + * + * Here I just want to create an incredibly simple main to get started with GRID and get used + * to its syntax. If the reader is like me, they vaguely understand something about lattice coding, + * they don't know a ton of C++, don't know much of the fine details, and certainly know nothing about GRID. + * + * Once you've made a new executable, like this one, you can bootstrap.sh again. At this point, + * the code should be able to find your new executable. You can tell that bootstrap.sh worked by + * having a look at Make.inc. You should see your executable inside there. + * + * Warning: This code illustrative only, not well tested, and not meant for production use. The best + * way to read this code is to start at the main. + * + */ + + +// All your mains should have this +#include +using namespace Grid; + + +// This copies what already exists in WilsonLoops.h. The point here is to be pedagogical and explain in +// detail what everything does so we can see how GRID works. +template class WLoops : public Gimpl { +public: + // Gimpl seems to be an arbitrary class. Within this class, it is expected that certain types are + // already defined, things like Scalar and Field. This macro includes a bunch of #typedefs that + // implement this equivalence at compile time. + INHERIT_GIMPL_TYPES(Gimpl); + + // Some example Gimpls can be found in GaugeImplementations.h, at the bottom. These are in turn built + // out of GaugeImplTypes, which can be found in GaugeImplTypes.h. The GaugeImplTypes contain the base + // field/vector/link/whatever types. These inherit from iScalar, iVector, and iMatrix objects, which + // are sort of the building blocks for gerenal math objects. The "i" at the beginning of these names + // indicates that they should be for internal use only. It seems like these base types have the + // acceleration, e.g. SIMD or GPU or what-have-you, abstracted away. How you accelerate these things + // appears to be controlled through a template parameter called vtype. + + // The general math/physics objects, such as a color matrix, are built up by nesting these objects. + // For instance a general color matrix has two color indices, so it's built up like + // iScalar &U, const int mu, const int nu) { + // Calls like CovShiftForward and CovShiftBackward have 3 arguments, and they multiply together + // the first and last argument. (Second arg gives the shift direction.) The CovShiftIdentityBackward + // has meanwhile only two arguments; it just returns the shifted (adjoint since backward) link. + plaq = Gimpl::CovShiftForward(U[mu],mu, + // Means Link*Cshift(field,mu,1), arguments are Link, mu, field in that order. + Gimpl::CovShiftForward(U[nu],nu, + Gimpl::CovShiftBackward(U[mu],mu, + // This means Cshift(adj(Link), mu, -1) + Gimpl::CovShiftIdentityBackward(U[nu], nu)))); + } + + // tr U_mu_nu(x) + static void traceDirPlaquette(ComplexField &plaq, const std::vector &U, const int mu, const int nu) { + // This .Grid() syntax seems to get the pointer to the GridBase. Apparently this is needed as argument + // to instantiate a Lattice object. + GaugeMat sp(U[0].Grid()); + dirPlaquette(sp, U, mu, nu); + plaq = trace(sp); + } + + // sum_mu_nu tr U_mu_nu(x) + static void sitePlaquette(ComplexField &Plaq, const std::vector &U) { + ComplexField sitePlaq(U[0].Grid()); + Plaq = Zero(); + // Nd=4 and Nc=3 are set as global constants in QCD.h + for (int mu = 1; mu < Nd; mu++) { + for (int nu = 0; nu < mu; nu++) { + traceDirPlaquette(sitePlaq, U, mu, nu); + Plaq = Plaq + sitePlaq; + } + } + } + + // sum_mu_nu_x Re tr U_mu_nu(x) + static RealD sumPlaquette(const GaugeLorentz &Umu) { + std::vector U(Nd, Umu.Grid()); + for (int mu = 0; mu < Nd; mu++) { + // Umu is a GaugeLorentz object, and as such has a non-trivial Lorentz index. We can + // access the element in the mu Lorentz index with this PeekIndex syntax. + U[mu] = PeekIndex(Umu, mu); + } + ComplexField Plaq(Umu.Grid()); + sitePlaquette(Plaq, U); + // I guess this should be the line that sums over all space-time sites. + auto Tp = sum(Plaq); + // Until now, we have been working with objects inside the tensor nest. This TensorRemove gets + // rid of the tensor nest to return whatever is inside. + auto p = TensorRemove(Tp); + return p.real(); + } + + // < Re tr U_mu_nu(x) > + static RealD avgPlaquette(const GaugeLorentz &Umu) { + // Real double type + RealD sumplaq = sumPlaquette(Umu); + // gSites() is the number of global sites. there is also lSites() for local sites. + double vol = Umu.Grid()->gSites(); + // The number of orientations. 4*3/2=6 for Nd=4, as known. + double faces = (1.0 * Nd * (Nd - 1)) / 2.0; + return sumplaq / vol / faces / Nc; + } +}; + + +// Next we show an example of how to construct an input parameter class. We first inherit +// from Serializable. Then all class data members have to be defined using the +// GRID_SERIALIZABLE_CLASS_MEMBERS macro. This variadic macro allows for arbitrarily many +// class data members. In the below case, we make a parameter file holding the configuration +// name. Here, it expects the name to be labeled with "conf_name" in the configuration file. +struct ConfParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS( + ConfParameters, + std::string, conf_name); + + template + ConfParameters(Reader& Reader){ + // If we are reading an XML file, it should be structured like: + // + // + // l20t20b06498a_nersc.302500 + // + // + read(Reader, "parameters", *this); + } +}; + + + +// This syntax lets you pass command line arguments to main. An asterisk means that what follows is +// a pointer. Two asterisks means what follows is a pointer to an array. +int main (int argc, char **argv) +{ + // This initializes Grid. Some command line options include + // --mpi n.n.n.n + // --threads n + // --grid n.n.n.n + Grid_init(&argc, &argv); + + // This is where you would specify a custom lattice size, if not from the command line. Here + // Nd is a global quantity that is currently set to 4. + Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + Coordinate latt_size = GridDefaultLatt(); + + // Instantiate the spacetime Grid on which everything will be built. + GridCartesian GRID(latt_size,simd_layout,mpi_layout); + + // The PeriodicGimplD type is what you want for gauge matrices. There is also a LatticeGaugeFieldD + // type that you can use, which will work perfectly with what follows. + PeriodicGimplD::Field U(&GRID); + + // Here we read in the parameter file params.json to get conf_name. The last argument is what the + // top organizational level is called in the param file. + XmlReader Reader("Example_plaquette.xml",false, "grid"); + ConfParameters param(Reader); + + // Load a lattice from SIMULATeQCD into U. SIMULATeQCD finds plaquette = 0.6381995717 + FieldMetaData header; + NerscIO::readConfiguration(U, header, param.conf_name); + + // Let's see what we find. + RealD plaq = WLoops::avgPlaquette(U); + + // This is how you make log messages. + std::cout << GridLogMessage << std::setprecision(std::numeric_limits::digits10 + 1) << "Plaquette = " << plaq << std::endl; + + // To wrap things up. + Grid_finalize(); +} \ No newline at end of file diff --git a/m4/ax_cxx_compile_stdcxx_14.m4 b/m4/ax_cxx_compile_stdcxx_14.m4 new file mode 100644 index 00000000..094db0d0 --- /dev/null +++ b/m4/ax_cxx_compile_stdcxx_14.m4 @@ -0,0 +1,34 @@ +# ============================================================================= +# https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_14.html +# ============================================================================= +# +# SYNOPSIS +# +# AX_CXX_COMPILE_STDCXX_14([ext|noext], [mandatory|optional]) +# +# DESCRIPTION +# +# Check for baseline language coverage in the compiler for the C++14 +# standard; if necessary, add switches to CXX and CXXCPP to enable +# support. +# +# This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX +# macro with the version set to C++14. The two optional arguments are +# forwarded literally as the second and third argument respectively. +# Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for +# more information. If you want to use this macro, you also need to +# download the ax_cxx_compile_stdcxx.m4 file. +# +# LICENSE +# +# Copyright (c) 2015 Moritz Klammler +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 5 + +AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX]) +AC_DEFUN([AX_CXX_COMPILE_STDCXX_14], [AX_CXX_COMPILE_STDCXX([14], [$1], [$2])]) diff --git a/scripts/eigen-3.3.5.Tensor.patch b/scripts/eigen-3.3.5.Tensor.patch deleted file mode 100644 index 54984b94..00000000 --- a/scripts/eigen-3.3.5.Tensor.patch +++ /dev/null @@ -1,19 +0,0 @@ ---- ./Eigen/unsupported/Eigen/CXX11/Tensor 2018-07-23 10:33:42.000000000 +0100 -+++ Tensor 2018-08-28 16:15:56.000000000 +0100 -@@ -25,7 +25,7 @@ - #include - #endif - --#include -+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" - - #include "../SpecialFunctions" - #include "src/util/CXX11Meta.h" -@@ -147,6 +147,6 @@ - - #include "src/Tensor/TensorIO.h" - --#include -+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" - - //#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/systems/Aurora/benchmarks/bench1024.pbs b/systems/Aurora/benchmarks/bench1024.pbs new file mode 100644 index 00000000..2e99ae4b --- /dev/null +++ b/systems/Aurora/benchmarks/bench1024.pbs @@ -0,0 +1,60 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=1024 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU +export FI_CXI_CQ_FILL_PERCENT=10 +export FI_CXI_DEFAULT_CQ_SIZE=262144 +#export FI_CXI_DEFAULT_CQ_SIZE=131072 +#export FI_CXI_CQ_FILL_PERCENT=20 + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1024node.dwf.small.cq + +CMD="mpiexec -np 12288 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1024node.dwf.cq + + diff --git a/systems/Aurora/benchmarks/bench12.pbs b/systems/Aurora/benchmarks/bench12.pbs new file mode 100644 index 00000000..ee3cb381 --- /dev/null +++ b/systems/Aurora/benchmarks/bench12.pbs @@ -0,0 +1,60 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=2 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +#$CMD + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +#$CMD + +CMD="mpiexec -np 1 -ppn 1 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 1 -ppn 1 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD diff --git a/systems/Aurora/benchmarks/bench2048.pbs b/systems/Aurora/benchmarks/bench2048.pbs new file mode 100644 index 00000000..b79081a2 --- /dev/null +++ b/systems/Aurora/benchmarks/bench2048.pbs @@ -0,0 +1,56 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=2048 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2048node.dwf.small + +CMD="mpiexec -np 24576 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2048node.dwf + + diff --git a/systems/Aurora/benchmarks/bench256.pbs b/systems/Aurora/benchmarks/bench256.pbs new file mode 100644 index 00000000..405d9ed4 --- /dev/null +++ b/systems/Aurora/benchmarks/bench256.pbs @@ -0,0 +1,48 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=256 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 3072 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 3072 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 256node.dwf.large diff --git a/systems/Aurora/benchmarks/bench512.pbs b/systems/Aurora/benchmarks/bench512.pbs new file mode 100644 index 00000000..0d8708d3 --- /dev/null +++ b/systems/Aurora/benchmarks/bench512.pbs @@ -0,0 +1,48 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=512 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 6144 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 6144 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 512node.dwf.large diff --git a/systems/Aurora/benchmarks/bench_scaling.pbs b/systems/Aurora/benchmarks/bench_scaling.pbs new file mode 100644 index 00000000..504fd3e9 --- /dev/null +++ b/systems/Aurora/benchmarks/bench_scaling.pbs @@ -0,0 +1,80 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=32 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 32 nodes, 384 ranks +# +CMD="mpiexec -np 384 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" + +$CMD + +CMD="mpiexec -np 12 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 1node.dwf + + +CMD="mpiexec -np 24 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 2node.dwf + +CMD="mpiexec -np 48 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 4node.dwf + +CMD="mpiexec -np 96 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 8node.dwf + +CMD="mpiexec -np 192 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 16node.dwf + + +CMD="mpiexec -np 384 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \ + --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" +$CMD | tee 32node.dwf diff --git a/systems/Aurora/benchmarks/gpu_tile_compact.sh b/systems/Aurora/benchmarks/gpu_tile_compact.sh new file mode 100755 index 00000000..5cab1ee3 --- /dev/null +++ b/systems/Aurora/benchmarks/gpu_tile_compact.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) +#export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) +export NIC_MAP=(0 1 2 4 5 6 0 1 2 4 5 6 ) +export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) +export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) + +export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} +export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]} +export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]} +export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} +export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} + +#export GRID_MPICH_NIC_BIND=$NIC +#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id + +unset EnableWalkerPartition +export EnableImplicitScaling=0 +export ZE_AFFINITY_MASK=$gpu_id.$tile_id +export ONEAPI_DEVICE_FILTER=gpu,level_zero + +#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 +#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 +#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 + +#echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " + +numactl -m $NUMA -N $NUMAP "$@" diff --git a/systems/Aurora/benchmarks/gpu_tile_compact4.sh b/systems/Aurora/benchmarks/gpu_tile_compact4.sh new file mode 100755 index 00000000..1928c4a2 --- /dev/null +++ b/systems/Aurora/benchmarks/gpu_tile_compact4.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +export NUMA_MAP=(2 2 3 3 2 2 3 3 ) +export PROC_MAP=(0 0 1 1 0 0 1 1 ) +export NIC_MAP=(0 0 4 4 1 1 5 5 ) +export GPU_MAP=(0 1 3 4 0 1 3 4 ) +export TILE_MAP=(0 0 0 0 1 1 1 1 ) +export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} +export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]} +export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} +export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} + +#export GRID_MPICH_NIC_BIND=$NIC + +unset EnableWalkerPartition +export EnableImplicitScaling=0 +export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 +export ZE_AFFINITY_MASK=$gpu_id.$tile_id +#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id +export ONEAPI_DEVICE_FILTER=gpu,level_zero +export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 +export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 +#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 + +echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA" + +numactl -m $NUMA -N $PROC_MAP "$@" diff --git a/systems/Aurora/config-command b/systems/Aurora/config-command new file mode 100644 index 00000000..689747c9 --- /dev/null +++ b/systems/Aurora/config-command @@ -0,0 +1,16 @@ +TOOLS=$HOME/tools +../../configure \ + --enable-simd=GPU \ + --enable-gen-simd-width=64 \ + --enable-comms=mpi-auto \ + --enable-accelerator-cshift \ + --disable-gparity \ + --disable-fermion-reps \ + --enable-shm=nvlink \ + --enable-accelerator=sycl \ + --enable-unified=no \ + MPICXX=mpicxx \ + CXX=icpx \ + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \ + CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel" + diff --git a/systems/Aurora/proxies.sh b/systems/Aurora/proxies.sh new file mode 100644 index 00000000..ff0d5a5b --- /dev/null +++ b/systems/Aurora/proxies.sh @@ -0,0 +1,9 @@ +export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 +export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +git config --global http.proxy http://proxy.alcf.anl.gov:3128 +module use /soft/modulefiles +module load intel_compute_runtime/release/agama-devel-682.22 + diff --git a/systems/Aurora/sourceme.sh b/systems/Aurora/sourceme.sh new file mode 100644 index 00000000..8951e43c --- /dev/null +++ b/systems/Aurora/sourceme.sh @@ -0,0 +1,26 @@ +#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 + +module use /soft/modulefiles +module load intel_compute_runtime/release/agama-devel-682.22 + +export FI_CXI_DEFAULT_CQ_SIZE=131072 +export FI_CXI_CQ_FILL_PERCENT=20 + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" +#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" + +# +# -ftarget-register-alloc-mode=pvc:default +# -ftarget-register-alloc-mode=pvc:small +# -ftarget-register-alloc-mode=pvc:large +# -ftarget-register-alloc-mode=pvc:auto +# + +export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 +export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 +export http_proxy=http://proxy.alcf.anl.gov:3128 +export https_proxy=http://proxy.alcf.anl.gov:3128 +#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 +git config --global http.proxy http://proxy.alcf.anl.gov:3128 + +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" diff --git a/systems/Aurora/tests/repro16.pbs b/systems/Aurora/tests/repro16.pbs new file mode 100644 index 00000000..28030a3d --- /dev/null +++ b/systems/Aurora/tests/repro16.pbs @@ -0,0 +1,40 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=16 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 16 nodes, 192 ranks +CMD="mpiexec -np 192 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" +$CMD diff --git a/systems/Aurora/tests/solver/stag16.pbs b/systems/Aurora/tests/solver/stag16.pbs new file mode 100644 index 00000000..5bfe04a6 --- /dev/null +++ b/systems/Aurora/tests/solver/stag16.pbs @@ -0,0 +1,40 @@ +#!/bin/bash + +## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 + +#PBS -q EarlyAppAccess +#PBS -l select=16 +#PBS -l walltime=01:00:00 +#PBS -A LatticeQCD_aesp_CNDA + +#export OMP_PROC_BIND=spread +#unset OMP_PLACES + +cd $PBS_O_WORKDIR + +source ../../sourceme.sh + +cat $PBS_NODEFILE + +export OMP_NUM_THREADS=3 +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 + +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE +#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE +#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST + +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 +export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 +export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 +export MPICH_OFI_NIC_POLICY=GPU + +# 12 ppn, 16 nodes, 192 ranks +CMD="mpiexec -np 192 -ppn 12 -envall \ + ./gpu_tile_compact.sh \ + ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \ + --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000" +$CMD diff --git a/systems/Booster/benchmarks/Benchmark_usqcd.csv b/systems/Booster/benchmarks/Benchmark_usqcd.csv new file mode 100644 index 00000000..68689deb --- /dev/null +++ b/systems/Booster/benchmarks/Benchmark_usqcd.csv @@ -0,0 +1,70 @@ +Memory Bandwidth + +Bytes, GB/s per node +3145728, 225.900365 +50331648, 2858.859504 +254803968, 4145.556367 +805306368, 4905.772480 +1966080000, 4978.312557 + + +GEMM + + M, N, K, BATCH, GF/s per rank +16, 8, 16, 256, 1.713639 +16, 16, 16, 256, 288.268316 +16, 32, 16, 256, 597.053950 +32, 8, 32, 256, 557.382591 +32, 16, 32, 256, 1100.145311 +32, 32, 32, 256, 1885.080449 +64, 8, 64, 256, 1725.163599 +64, 16, 64, 256, 3389.336566 +64, 32, 64, 256, 4168.252422 +16, 8, 256, 256, 1326.262134 +16, 16, 256, 256, 2318.095475 +16, 32, 256, 256, 3555.436503 +32, 8, 256, 256, 1920.139170 +32, 16, 256, 256, 3486.174753 +32, 32, 256, 256, 5320.821724 +64, 8, 256, 256, 2539.597502 +64, 16, 256, 256, 5003.456775 +64, 32, 256, 256, 7837.531562 +8, 256, 16, 256, 1427.848170 +16, 256, 16, 256, 2222.147815 +32, 256, 16, 256, 2877.121715 +8, 256, 32, 256, 1922.890086 +16, 256, 32, 256, 3199.469082 +32, 256, 32, 256, 4845.405343 +8, 256, 64, 256, 2639.483343 +16, 256, 64, 256, 5012.800299 +32, 256, 64, 256, 7216.006882 + + + +Communications + +Packet bytes, direction, GB/s per node +4718592, 2, 206.570734 +4718592, 3, 207.501847 +4718592, 6, 189.730277 +4718592, 7, 204.301218 +15925248, 2, 307.882997 +15925248, 3, 287.901076 +15925248, 6, 295.603109 +15925248, 7, 300.682033 +37748736, 2, 331.740364 +37748736, 3, 338.610627 +37748736, 6, 332.580657 +37748736, 7, 336.336579 + + +Per node summary table + +L , Wilson, DWF4, Staggered, GF/s per node + +8 , 16, 1165, 10 +12 , 473, 4901, 163 +16 , 1436, 8464, 442 +24 , 4133, 10139, 1530 +32 , 5726, 11487, 2518 + diff --git a/systems/Booster/config-command b/systems/Booster/config-command index 8530c5f9..1ba2dc7a 100644 --- a/systems/Booster/config-command +++ b/systems/Booster/config-command @@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/ --enable-gen-simd-width=64 \ --enable-shm=nvlink \ --enable-accelerator=cuda \ + --disable-gparity \ + --disable-fermion-reps \ --with-lime=$LIME \ - --disable-accelerator-cshift \ + --enable-accelerator-cshift \ --disable-unified \ CXX=nvcc \ LDFLAGS="-cudart shared " \ - CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared" + CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas" diff --git a/systems/Booster/sourceme.sh b/systems/Booster/sourceme.sh index 56499be4..2341267f 100644 --- a/systems/Booster/sourceme.sh +++ b/systems/Booster/sourceme.sh @@ -1,5 +1,5 @@ -module load GCC/9.3.0 -module load GMP/6.2.0 -module load MPFR/4.1.0 -module load OpenMPI/4.1.0rc1 -module load CUDA/11.3 +module load GCC +module load GMP +module load MPFR +module load OpenMPI +module load CUDA diff --git a/systems/Frontier/sourceme.sh b/systems/Frontier/sourceme.sh index 56eec5b5..987241b4 100644 --- a/systems/Frontier/sourceme.sh +++ b/systems/Frontier/sourceme.sh @@ -3,7 +3,7 @@ spack load c-lime #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/sw/crusher/spack-envs/base/opt/cray-sles15-zen3/gcc-11.2.0/gperftools-2.9.1-72ubwtuc5wcz2meqltbfdb76epufgzo2/lib module load emacs module load PrgEnv-gnu -module load rocm/5.3.0 +module load rocm module load cray-mpich/8.1.23 module load gmp module load cray-fftw diff --git a/systems/Lumi/HMC/32cube/fthmc3gev.slurm b/systems/Lumi/HMC/32cube/fthmc3gev.slurm new file mode 100644 index 00000000..4cdc5136 --- /dev/null +++ b/systems/Lumi/HMC/32cube/fthmc3gev.slurm @@ -0,0 +1,57 @@ +#!/bin/bash -l +#SBATCH --job-name=fthmc3ge +#SBATCH --partition=small-g +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +##SBATCH --cpus-per-task=8 +#SBATCH --gpus-per-node=8 +#SBATCH --time=2:00:00 +#SBATCH --account=project_465000546 +#SBATCH --gpu-bind=none +#SBATCH --exclusive +#SBATCH --mem=0 + + +#sbatch --dependency=afterany:$SLURM_JOBID fthmc3gev.slurm + +CPU_BIND="map_ldom:3,3,1,1,0,0,2,2" +MEM_BIND="map_mem:3,3,1,1,0,0,2,2" +echo $CPU_BIND + +cat << EOF > ./select_gpu +#!/bin/bash +export GPU_MAP=(0 1 2 3 4 5 6 7) +export NUMA_MAP=(3 3 1 1 0 0 2 2) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export NUM=\${NUMA_MAP[\$SLURM_LOCALID]} +#export HIP_VISIBLE_DEVICES=\$GPU +export ROCR_VISIBLE_DEVICES=\$GPU +echo RANK \$SLURM_LOCALID using GPU \$GPU +echo NUMA \$SLURM_LOCALID using NUMA \${NUM} +echo numactl -m \$NUM -N \$NUM \$* +exec numactl -m \$NUM -N \$NUM \$* +EOF +cat ./select_gpu + +chmod +x ./select_gpu + +root=/scratch/project_465000546/boylepet/Grid/systems/Lumi +source ${root}/sourceme.sh + +export OMP_NUM_THREADS=7 +export MPICH_SMP_SINGLE_COPY_MODE=CMA +export MPICH_GPU_SUPPORT_ENABLED=1 + +#cfg=`ls -rt ckpoint_*lat* | tail -n 1 ` +#traj="${cfg#*.}" +#cfg=`ls -rt ckpoint_*lat* | tail -n 1 ` +traj=0 + +vol=32.32.32.64 +mpi=1.2.2.2 +PARAMS="--mpi $mpi --accelerator-threads 16 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol" +#HMCPARAMS="--StartingType CheckpointStart --StartingTrajectory $traj --Trajectories 200" +HMCPARAMS="--StartingType ColdStart --StartingTrajectory $traj --Trajectories 20" + +srun ./select_gpu ../FTHMC2p1f_3GeV $HMCPARAMS $PARAMS + diff --git a/systems/Lumi/config-command b/systems/Lumi/config-command index 3f7877c8..5e596285 100644 --- a/systems/Lumi/config-command +++ b/systems/Lumi/config-command @@ -23,7 +23,7 @@ echo mpfr X$MPFR --disable-fermion-reps \ --disable-gparity \ CXX=hipcc MPICXX=mpicxx \ - CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \ + CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++17 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \ LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp" diff --git a/systems/OEM/setup.sh b/systems/OEM/setup.sh deleted file mode 100644 index 3b8188f0..00000000 --- a/systems/OEM/setup.sh +++ /dev/null @@ -1,3 +0,0 @@ -export https_proxy=http://proxy-chain.intel.com:911 -module load intel-release -module load intel/mpich diff --git a/systems/OEM/README b/systems/PVC-OEM/README similarity index 100% rename from systems/OEM/README rename to systems/PVC-OEM/README diff --git a/systems/OEM/benchmarks/bench.sh b/systems/PVC-OEM/benchmarks/bench.sh similarity index 100% rename from systems/OEM/benchmarks/bench.sh rename to systems/PVC-OEM/benchmarks/bench.sh diff --git a/systems/OEM/benchmarks/select_gpu.sh b/systems/PVC-OEM/benchmarks/select_gpu.sh similarity index 62% rename from systems/OEM/benchmarks/select_gpu.sh rename to systems/PVC-OEM/benchmarks/select_gpu.sh index 2ef1f82d..db1cfc85 100755 --- a/systems/OEM/benchmarks/select_gpu.sh +++ b/systems/PVC-OEM/benchmarks/select_gpu.sh @@ -1,9 +1,8 @@ #!/bin/bash num_tile=2 - -gpu_id=$(( (MPI_LOCAL_RANKID % num_tile ) )) -tile_id=$((MPI_LOCAL_RANKID / num_tile)) +gpu_id=$(( (MPI_LOCALRANKID / num_tile ) )) +tile_id=$((MPI_LOCALRANKID % num_tile)) export ZE_AFFINITY_MASK=$gpu_id.$tile_id diff --git a/systems/OEM/config-command b/systems/PVC-OEM/config-command similarity index 100% rename from systems/OEM/config-command rename to systems/PVC-OEM/config-command diff --git a/systems/PVC-OEM/setup.sh b/systems/PVC-OEM/setup.sh new file mode 100644 index 00000000..0e780ef4 --- /dev/null +++ b/systems/PVC-OEM/setup.sh @@ -0,0 +1,5 @@ +export https_proxy=http://proxy-chain.intel.com:911 +module load intel-release +module load intel/mpich +export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 +export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh deleted file mode 100755 index 9a29b773..00000000 --- a/systems/PVC/benchmarks/run-1tile.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/sh -##SBATCH -p PVC-SPR-QZEH -##SBATCH -p PVC-ICX-QZNW -#SBATCH -p QZ1J-ICX-PVC -##SBATCH -p QZ1J-SPR-PVC-2C - -#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh - -export NT=8 - -export I_MPI_OFFLOAD=1 -export I_MPI_OFFLOAD_TOPOLIB=level_zero -export I_MPI_OFFLOAD_DOMAIN_SIZE=-1 - -# export IGC_EnableLSCFenceUGMBeforeEOT=0 -# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False" -export SYCL_DEVICE_FILTER=gpu,level_zero -#export IGC_ShaderDumpEnable=1 -#export IGC_DumpToCurrentDir=1 -export I_MPI_OFFLOAD_CELL=tile -export EnableImplicitScaling=0 -export EnableWalkerPartition=0 -export ZE_AFFINITY_MASK=0.0 -mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768 - -export ZE_AFFINITY_MASK=0 -export I_MPI_OFFLOAD_CELL=device -export EnableImplicitScaling=1 -export EnableWalkerPartition=1 - - - - - - - - - - - - - - - - - - - - -#mpiexec -launcher ssh -n 2 -host localhost vtune -collect gpu-hotspots -knob gpu-sampling-interval=1 -data-limit=0 -r ./vtune_run4 -- ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 - -#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 - -#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 - -#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-overlap --shm-mpi 1 - -#mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 - -#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 16.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 0 -#mpirun -np 2 ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --comms-sequential --shm-mpi 1 - diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh deleted file mode 100755 index 1db67508..00000000 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -##SBATCH -p PVC-SPR-QZEH -##SBATCH -p PVC-ICX-QZNW - -#SBATCH -p QZ1J-ICX-PVC - -#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh - -export NT=16 - -# export IGC_EnableLSCFenceUGMBeforeEOT=0 -# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False" -#export IGC_ShaderDumpEnable=1 -#export IGC_DumpToCurrentDir=1 -export I_MPI_OFFLOAD=1 -export I_MPI_OFFLOAD_TOPOLIB=level_zero -export I_MPI_OFFLOAD_DOMAIN_SIZE=-1 -export SYCL_DEVICE_FILTER=gpu,level_zero -export I_MPI_OFFLOAD_CELL=tile -export EnableImplicitScaling=0 -export EnableWalkerPartition=0 -#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 -#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 - -for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 -do -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i -done - -mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 - diff --git a/systems/PVC/benchmarks/wrap.sh b/systems/PVC/benchmarks/wrap.sh deleted file mode 100755 index b8806b30..00000000 --- a/systems/PVC/benchmarks/wrap.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/sh - -export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID - -echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK - - - $@ - diff --git a/systems/PVC/config-command b/systems/PVC/config-command deleted file mode 100644 index c3523c2d..00000000 --- a/systems/PVC/config-command +++ /dev/null @@ -1,16 +0,0 @@ -INSTALL=/nfs/site/home/paboylx/prereqs/ -../../configure \ - --enable-simd=GPU \ - --enable-gen-simd-width=64 \ - --enable-comms=mpi-auto \ - --disable-accelerator-cshift \ - --disable-gparity \ - --disable-fermion-reps \ - --enable-shm=nvlink \ - --enable-accelerator=sycl \ - --enable-unified=no \ - MPICXX=mpicxx \ - CXX=dpcpp \ - LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare" - diff --git a/systems/PVC/setup.sh b/systems/PVC/setup.sh deleted file mode 100644 index c3b97ce0..00000000 --- a/systems/PVC/setup.sh +++ /dev/null @@ -1,18 +0,0 @@ -export https_proxy=http://proxy-chain.intel.com:911 -#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH -export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH - -module load intel-release -module load intel-comp-rt/embargo-ci-neo - -#source /opt/intel/oneapi/PVC_setup.sh -#source /opt/intel/oneapi/ATS_setup.sh -#module load intel-nightly/20230331 -#module load intel-comp-rt/ci-neo-master/026093 - -#module load intel/mpich -module load intel/mpich/pvc45.3 -export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH - -#clsh embargo-ci-neo-022845 -#source /opt/intel/vtune_amplifier/amplxe-vars.sh diff --git a/systems/SDCC-A100/bench.slurm b/systems/SDCC-A100/bench.slurm new file mode 100644 index 00000000..04d1e1e2 --- /dev/null +++ b/systems/SDCC-A100/bench.slurm @@ -0,0 +1,42 @@ +#!/bin/bash +#SBATCH --partition csi +#SBATCH --time=00:10:00 +#SBATCH -A csigeneral +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH --ntasks=4 +#SBATCH --qos csi +#SBATCH --gres=gpu:4 + +source sourceme.sh + +cat << EOF > select_gpu +#!/bin/bash +export GPU_MAP=(0 1 2 3) +export GPU=\${GPU_MAP[\$SLURM_LOCALID]} +export CUDA_VISIBLE_DEVICES=\$GPU +unset ROCR_VISIBLE_DEVICES +echo RANK \$SLURM_LOCALID using GPU \$GPU +exec \$* +EOF +chmod +x ./select_gpu + + +export OMP_NUM_THREADS=4 +export OMPI_MCA_btl=^uct,openib +export UCX_TLS=cuda,gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc +export UCX_RNDV_SCHEME=put_zcopy +export UCX_RNDV_THRESH=16384 +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_MEMTYPE_CACHE=n + +export OMP_NUM_THREAD=8 +#srun -N1 -n1 nvidia-smi +#srun -N1 -n1 numactl -H > numa.txt +srun -N1 -n1 lstopo A100-topo.pdf + +# 4.35 TF/s +#srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --shm 2048 --shm-mpi 0 --accelerator-threads 16 + +srun -N1 -n4 ./select_gpu ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 32.32.64.64 --shm 2048 --shm-mpi 0 --accelerator-threads 16 + diff --git a/systems/SDCC-A100/config-command b/systems/SDCC-A100/config-command new file mode 100644 index 00000000..26ad5377 --- /dev/null +++ b/systems/SDCC-A100/config-command @@ -0,0 +1,17 @@ +../../configure \ +--enable-comms=mpi-auto \ +--enable-unified=no \ +--enable-shm=nvlink \ +--enable-accelerator=cuda \ +--enable-gen-simd-width=64 \ +--enable-simd=GPU \ +--disable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=nvcc \ +MPICXX=mpicxx \ +LDFLAGS="-cudart shared " \ +CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared" + + + diff --git a/systems/SDCC-A100/sourceme.sh b/systems/SDCC-A100/sourceme.sh new file mode 100644 index 00000000..2aa86b7e --- /dev/null +++ b/systems/SDCC-A100/sourceme.sh @@ -0,0 +1,2 @@ +module load cuda/12.2 +module load openmpi diff --git a/systems/SDCC-ARM/config-command-mpi b/systems/SDCC-ARM/config-command-mpi new file mode 100644 index 00000000..882cfe56 --- /dev/null +++ b/systems/SDCC-ARM/config-command-mpi @@ -0,0 +1,6 @@ +HDF=$HOME/paboyle/install + +LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=NEONv8 --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF +#LDFLAGS=-L$HDF/lib CXX=clang++ ../../configure --enable-simd=GEN --enable-comms=none --enable-unified=yes --disable-fermion-reps --disable-gparity --disable-debug --with-hdf5=$HDF + + diff --git a/systems/SDCC-ICE/bench.slurm b/systems/SDCC-ICE/bench.slurm new file mode 100644 index 00000000..76beb828 --- /dev/null +++ b/systems/SDCC-ICE/bench.slurm @@ -0,0 +1,31 @@ +#!/bin/bash +#SBATCH --partition lqcd +#SBATCH --time=00:20:00 +#SBATCH -A lqcdtest +#SBATCH --exclusive +#SBATCH --nodes=1 +#SBATCH --ntasks=2 +#SBATCH --qos lqcd + +source sourceme.sh + +export OMP_NUM_THREAD=24 +#srun -N1 -n1 numactl -H > numa.txt +#srun -N1 -n1 lstopo ice-topo.pdf + +cat << EOF > select_socket +#!/bin/bash +export NUM_MAP=(0 1) +export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]} +exec \$* +EOF +chmod +x ./select_socket + +#for vol in 8.8.8.16 8.8.8.32 8.8.8.64 +#for vol in 8.8.16.16 8.8.16.32 8.8.16.64 +for vol in 8.16.16.16 8.16.16.32 8.16.16.64 16.16.16.32 16.16.16.64 24.24.24.64 32.32.32.32 +do +srun --cpu-bind=ldoms -N1 -n2 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid $vol --dslash-asm > $vol.2socket.out +srun --cpu-bind=ldoms -N1 -n1 ./select_socket ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm > $vol.1socket.out +done + diff --git a/systems/SDCC-ICE/config-command b/systems/SDCC-ICE/config-command new file mode 100644 index 00000000..bc28c96d --- /dev/null +++ b/systems/SDCC-ICE/config-command @@ -0,0 +1,19 @@ +../../configure \ +--enable-debug \ +--enable-comms=mpi-auto \ +--enable-unified=yes \ +--enable-shm=shmopen \ +--enable-shm-fast-path=shmopen \ +--enable-accelerator=none \ +--enable-simd=AVX512 \ +--disable-accelerator-cshift \ +--disable-fermion-reps \ +--disable-gparity \ +CXX=clang++ \ +MPICXX=mpicxx \ +LDFLAGS=-L/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/hwloc-2.9.1-hgkscnt5pferhtde4ahctlupb6qf3vtl/lib/ \ +LIBS=-lhwloc \ +CXXFLAGS="-std=c++17" + + + diff --git a/systems/SDCC-ICE/sourceme.sh b/systems/SDCC-ICE/sourceme.sh new file mode 100644 index 00000000..6263063c --- /dev/null +++ b/systems/SDCC-ICE/sourceme.sh @@ -0,0 +1,2 @@ +export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-12.0.1-agey6vtuw3e375rewhhobvkznjh5ltz4/lib/:$LD_LIBRARY_PATH +module load openmpi diff --git a/systems/Sunspot/benchmarks/bench.pbs b/systems/Sunspot/benchmarks/bench.pbs index f1a26aa4..dc07ca2f 100644 --- a/systems/Sunspot/benchmarks/bench.pbs +++ b/systems/Sunspot/benchmarks/bench.pbs @@ -20,7 +20,7 @@ unset OMP_PLACES cd $PBS_O_WORKDIR -qsub jobscript.pbs +#qsub jobscript.pbs echo Jobid: $PBS_JOBID echo Running on host `hostname` @@ -44,3 +44,4 @@ CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -enva ./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \ --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32" +$CMD diff --git a/systems/Sunspot/benchmarks/gpu_tile_compact.sh b/systems/Sunspot/benchmarks/gpu_tile_compact.sh index ec532b1b..11ed83aa 100755 --- a/systems/Sunspot/benchmarks/gpu_tile_compact.sh +++ b/systems/Sunspot/benchmarks/gpu_tile_compact.sh @@ -45,8 +45,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A if [ $PALS_LOCAL_RANKID = 0 ] then - onetrace --chrome-device-timeline "$@" -# "$@" +# onetrace --chrome-device-timeline "$@" + "$@" else "$@" fi diff --git a/systems/Sunspot/config-command b/systems/Sunspot/config-command index 7adf7117..e59ef515 100644 --- a/systems/Sunspot/config-command +++ b/systems/Sunspot/config-command @@ -11,6 +11,6 @@ TOOLS=$HOME/tools --enable-unified=no \ MPICXX=mpicxx \ CXX=icpx \ - LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \ + LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \ CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include" diff --git a/systems/mac-arm/config-command-mpi b/systems/mac-arm/config-command-mpi index 149bee17..be4d23dc 100644 --- a/systems/mac-arm/config-command-mpi +++ b/systems/mac-arm/config-command-mpi @@ -1,3 +1,2 @@ -BREW=/opt/local/ -MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug +CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=c++-13 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index cbc573d1..13cc0bb6 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -30,27 +30,16 @@ Author: Peter Boyle using namespace std; using namespace Grid; -template -struct scal { - d internal; -}; - - Gamma::Algebra Gmu [] = { - Gamma::Algebra::GammaX, - Gamma::Algebra::GammaY, - Gamma::Algebra::GammaZ, - Gamma::Algebra::GammaT - }; - int main (int argc, char ** argv) { + char hostname[HOST_NAME_MAX+1]; + gethostname(hostname, HOST_NAME_MAX+1); + std::string host(hostname); + Grid_init(&argc,&argv); const int Ls=12; - std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl; - - { GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi()); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); @@ -92,7 +81,14 @@ int main (int argc, char ** argv) SchurDiagMooeeOperator HermOpEO(Ddwf); SchurDiagMooeeOperator HermOpEO_f(Ddwf_f); - std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl; + int nsecs=600; + if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){ + std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds"); + GridCmdOptionInt(arg,nsecs); + } + + std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "< mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO); double t1,t2,flops; double MdagMsiteflops = 1452; // Mobius (real coeffs) @@ -101,7 +97,14 @@ int main (int argc, char ** argv) std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<gSites()*iters; std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< CG(1.0e-8,10000); - for(int i=0;i<1;i++){ + csumref=0; + int i=0; + do { + std::cerr << "******************* DOUBLE PRECISION SOLVE "<gSites()*iters; flops+= CGsiteflops*FrbGrid->gSites()*iters; - + std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.< munge; - std::string format = getFormatString(); - - BinaryIO::writeLatticeObject(result_o,file1,munge, 0, format, - nersc_csum,scidac_csuma,scidac_csumb); - - std::cout << GridLogMessage << " Mixed checksums "<(result_o_2,file1,munge, 0, format, - nersc_csum,scidac_csuma,scidac_csumb); - - std::cout << GridLogMessage << " CG checksums "< + +template inline void sliceSumCPU(const Grid::Lattice &Data,std::vector &result,int orthogdim) +{ + using namespace Grid; + /////////////////////////////////////////////////////// + // FIXME precision promoted summation + // may be important for correlation functions + // But easily avoided by using double precision fields + /////////////////////////////////////////////////////// + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object::scalar_type scalar_type; + GridBase *grid = Data.Grid(); + assert(grid!=NULL); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + Vector lvSum(rd); // will locally sum vectors first + Vector lsSum(ld,Zero()); // sum across these down to scalars + ExtractBuffer extracted(Nsimd); // splitting the SIMD + + result.resize(fd); // And then global sum to return the same vector to every node + for(int r=0;r_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + int ostride=grid->_ostride[orthogdim]; + + //Reduce Data down to lvSum + sliceSumReduction_cpu(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); + + // Sum across simd lanes in the plane, breaking out orthog dir. + Coordinate icoor(Nd); + + for(int rt=0;rtiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]; + + } + } + + // sum over nodes. + for(int t=0;t_processor_coor[orthogdim] ) { + result[t]=lsSum[lt]; + } else { + result[t]=Zero(); + } + + } + scalar_type * ptr = (scalar_type *) &result[0]; + int words = fd*sizeof(sobj)/sizeof(scalar_type); + grid->GlobalSumVector(ptr, words); +} + + +int main (int argc, char ** argv) { + + using namespace Grid; + + Grid_init(&argc,&argv); + + + Coordinate latt_size({64,64,64,16}); + auto simd_layout = GridDefaultSimd(Nd, vComplexD::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + GridCartesian Grid(latt_size, simd_layout, mpi_layout); + + std::vector seeds({1, 2, 3, 4}); + + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + + LatticeComplexD test_data(&Grid); + gaussian(pRNG,test_data); + + std::vector reduction_reference; + std::vector reduction_result; + + //warmup + for (int sweeps = 0; sweeps < 5; sweeps++) { + reduction_result = sliceSum(test_data,0); + } + + int trace_id = traceStart("sliceSum benchmark - ComplexD"); + std::cout << GridLogMessage << "Testing ComplexD" << std::endl; + std::cout << GridLogMessage << "sizeof(ComplexD) = " << sizeof(ComplexD) << std::endl; + std::cout << GridLogMessage << "sizeof(vComplexD) = " << sizeof(vComplexD) << std::endl; + for (int i = 0; i < Nd; i++) { + + RealD t=-usecond(); + + tracePush("sliceSum"); + sliceSumCPU(test_data,reduction_reference,i); + tracePop("sliceSum"); + + t+=usecond(); + std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl; + std::cout << GridLogMessage << "CPU sliceSum took "< reduction_reference_cv; + std::vector reduction_result_cv; + + //warmup + for (int sweeps = 0; sweeps < 5; sweeps++) { + reduction_result_cv = sliceSum(test_data_cv,0); + } + trace_id = traceStart("sliceSum benchmark - SpinVectorD"); + + std::cout << GridLogMessage << "Testing SpinVectorD" << std::endl; + std::cout << GridLogMessage << "sizeof(SpinVectorD) = " << sizeof(SpinVectorD) << std::endl; + std::cout << GridLogMessage << "sizeof(vSpinVectorD) = " << sizeof(vSpinVectorD) << std::endl; + for (int i = 0; i < Nd; i++) { + + RealD t=-usecond(); + + tracePush("sliceSum"); + sliceSumCPU(test_data_cv,reduction_reference_cv,i); + tracePop("sliceSum"); + + t+=usecond(); + std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl; + std::cout << GridLogMessage << "CPU sliceSum took "< reduction_reference_scv; + std::vector reduction_result_scv; + + //warmup + for (int sweeps = 0; sweeps < 5; sweeps++) { + reduction_result_scv = sliceSum(test_data_scv,0); + } + trace_id = traceStart("sliceSum benchmark - SpinColourVectorD"); + + std::cout << GridLogMessage << "Testing SpinColourVectorD" << std::endl; + std::cout << GridLogMessage << "sizeof(SpinColourVectorD) = " << sizeof(SpinColourVectorD) << std::endl; + std::cout << GridLogMessage << "sizeof(vSpinColourVectorD) = " << sizeof(vSpinColourVectorD) << std::endl; + for (int i = 0; i < Nd; i++) { + + RealD t=-usecond(); + + tracePush("sliceSum"); + sliceSumCPU(test_data_scv,reduction_reference_scv,i); + tracePop("sliceSum"); + + t+=usecond(); + std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl; + std::cout << GridLogMessage << "CPU sliceSum took "< reduction_reference_scm; + std::vector reduction_result_scm; + + //warmup + for (int sweeps = 0; sweeps < 5; sweeps++) { + reduction_result_scm = sliceSum(test_data_scm,0); + } + trace_id = traceStart("sliceSum benchmark - SpinColourMatrixD"); + + std::cout << GridLogMessage << "Testing SpinColourMatrixD" << std::endl; + std::cout << GridLogMessage << "sizeof(SpinColourMatrixD) = " << sizeof(SpinColourMatrixD) << std::endl; + std::cout << GridLogMessage << "sizeof(vSpinColourMatrixD) = " << sizeof(vSpinColourMatrixD) << std::endl; + for (int i = 0; i < Nd; i++) { + + RealD t=-usecond(); + + tracePush("sliceSum"); + sliceSumCPU(test_data_scm,reduction_reference_scm,i); + tracePop("sliceSum"); + + t+=usecond(); + std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl; + std::cout << GridLogMessage << "CPU sliceSum took "< using namespace std; using namespace Grid; +// This is to optimize the SIMD template void gpermute(vobj & inout,int perm){ vobj tmp=inout; if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;} @@ -39,7 +40,8 @@ template void gpermute(vobj & inout,int perm){ if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;} if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;} } - + + int main (int argc, char ** argv) { Grid_init(&argc,&argv); @@ -47,20 +49,21 @@ int main (int argc, char ** argv) Coordinate latt_size = GridDefaultLatt(); Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd()); Coordinate mpi_layout = GridDefaultMpi(); - std::cout << " mpi "<({45,12,81,9})); LatticeGaugeField Umu(&GRID); - + pRNG.SeedFixedIntegers(std::vector({45,12,81,9})); SU::HotConfiguration(pRNG,Umu); Real plaq=WilsonLoops::avgPlaquette(Umu); LatticeComplex trplaq(&GRID); + // Store Umu in U. Peek/Poke mean respectively getElement/setElement. std::vector U(Nd, Umu.Grid()); for (int mu = 0; mu < Nd; mu++) { U[mu] = PeekIndex(Umu, mu); @@ -70,9 +73,7 @@ int main (int argc, char ** argv) LatticeComplex cplaq(&GRID); cplaq=Zero(); - ///////////////////////////////////////////////// // Create a padded cell of extra padding depth=1 - ///////////////////////////////////////////////// int depth = 1; PaddedCell Ghost(depth,&GRID); LatticeGaugeField Ughost = Ghost.Exchange(Umu); @@ -114,18 +115,25 @@ int main (int argc, char ** argv) } #endif - ///// Array for the site plaquette + // Array for the site plaquette GridBase *GhostGrid = Ughost.Grid(); LatticeComplex gplaq(GhostGrid); - + + // Now we're going to put together the "stencil" that will be useful to us when + // calculating the plaquette. Our eventual goal is to make the product + // Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x), + // which requires, in order, the sites x, x+mu, x+nu, and x. We arrive at these + // sites relative to x through "shifts", which is represented here by a 4-d + // vector of 0s (no movement) and 1s (shift one unit) at each site. The + // "stencil" is the set of all these shifts. std::vector shifts; for(int mu=0;mu_offset; - int o1 = SE1->_offset; - int o2 = SE2->_offset; - int o3 = SE3->_offset; - - auto U0 = U_v[o0](mu); - auto U1 = U_v[o1](nu); - auto U2 = adj(U_v[o2](mu)); - auto U3 = adj(U_v[o3](nu)); + // Before doing accelerator stuff, there is an opening and closing of "Views". I guess the + // "Views" are stored in *_v variables listed below. + autoView( gp_v , gplaq, CpuWrite); + autoView( t_v , trplaq, CpuRead); + autoView( U_v , Ughost, CpuRead); - gpermute(U0,SE0->_permute); - gpermute(U1,SE1->_permute); - gpermute(U2,SE2->_permute); - gpermute(U3,SE3->_permute); - - gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 ); - s=s+4; - } - } + // This is now a loop over stencil shift elements. That is, s increases as we make our + // way through the spacetimes sites, but also as we make our way around the plaquette. + for(int ss=0;ss_offset; + int o1 = SE1->_offset; + int o2 = SE2->_offset; + int o3 = SE3->_offset; + + auto U0 = U_v[o0](mu); + auto U1 = U_v[o1](nu); + auto U2 = adj(U_v[o2](mu)); + auto U3 = adj(U_v[o3](nu)); + + gpermute(U0,SE0->_permute); + gpermute(U1,SE1->_permute); + gpermute(U2,SE2->_permute); + gpermute(U3,SE3->_permute); + + gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 ); + s=s+4; + } } } + + // Here is my understanding of this part: The padded cell has its own periodic BCs, so + // if I take a step to the right at the right-most side of the cell, I end up on the + // left-most side. This means that the plaquettes in the padding are wrong. Luckily + // all we care about are the plaquettes in the cell, which we obtain from Extract. cplaq = Ghost.Extract(gplaq); RealD vol = cplaq.Grid()->gSites(); RealD faces = (Nd * (Nd-1))/2; diff --git a/tests/smearing/Test_fatLinks.cc b/tests/smearing/Test_fatLinks.cc new file mode 100644 index 00000000..04d5b165 --- /dev/null +++ b/tests/smearing/Test_fatLinks.cc @@ -0,0 +1,181 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./tests/smearing/Test_fatLinks.cc + +Copyright (C) 2023 + +Author: D. A. Clarke + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* + @file Test_fatLinks.cc + @brief test of the HISQ smearing +*/ + + +#include +#include +#include +#include +using namespace Grid; + + +/*! @brief parameter file to easily adjust Nloop */ +struct ConfParameters: Serializable { + GRID_SERIALIZABLE_CLASS_MEMBERS( + ConfParameters, + int, benchmark, + int, Nloop); + + template + ConfParameters(Reader& Reader){ + read(Reader, "parameters", *this); + } +}; + + +bool testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD Usmr, LatticeGaugeFieldD Unaik, + LatticeGaugeFieldD Ucontrol, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) { + Smear_HISQ hisq_fat(&GRID,c1,cnaik,c3,c5,c7,clp); + LatticeGaugeFieldD diff(&GRID), Uproj(&GRID); + hisq_fat.smear(Usmr, Unaik, Umu); + bool result; + if (cnaik < 1e-30) { // Testing anything but Naik term + diff = Ucontrol-Usmr; + auto absDiff = norm2(diff)/norm2(Ucontrol); + if (absDiff < 1e-30) { + Grid_pass(" |Umu-Usmr|/|Umu| = ",absDiff); + result = true; + } else { + Grid_error(" |Umu-Usmr|/|Umu| = ",absDiff); + result = false; + } + } else { // Testing Naik specifically + diff = Ucontrol-Unaik; + auto absDiff = norm2(diff)/norm2(Ucontrol); + if (absDiff < 1e-30) { + Grid_pass(" |Umu-Unaik|/|Umu| = ",absDiff); + result = true; + } else { + Grid_error(" |Umu-Unaik|/|Umu| = ",absDiff); + result = false; + } + hisq_fat.projectU3(Uproj,Ucontrol); +// NerscIO::writeConfiguration(Unaik,"nersc.l8t4b3360.naik"); + } + return result; +} + + +int main (int argc, char** argv) { + + // Params for the test. + int Ns = 8; + int Nt = 4; + Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt; + std::string conf_in = "nersc.l8t4b3360"; + int threads = GridThread::GetThreads(); + + typedef LatticeGaugeFieldD LGF; + + // Initialize the Grid + Grid_init(&argc,&argv); + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + Grid_log("mpi = ",mpi_layout); + Grid_log("simd = ",simd_layout); + Grid_log("latt = ",latt_size); + Grid_log("threads = ",threads); + GridCartesian GRID(latt_size,simd_layout,mpi_layout); + + XmlReader Reader("fatParams.xml",false,"grid"); + ConfParameters param(Reader); + if(param.benchmark) Grid_log(" Nloop = ",param.Nloop); + + LGF Umu(&GRID), Usmr(&GRID), Unaik(&GRID), Ucontrol(&GRID); + + // Read the configuration into Umu + FieldMetaData header; + NerscIO::readConfiguration(Umu, header, conf_in); + + bool pass=true; + + // Carry out various tests + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.357lplink.control"); + pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,1/384.,-1/8.); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.357link.control"); + pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,1/384.,0.); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.35link.control"); + pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,0.,0.); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.3link.control"); + pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,0.,0.,0.); + NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.naik.control"); + pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,0.,0.8675309,0.,0.,0.,0.); + + if(pass){ + Grid_pass("All tests passed."); + } else { + Grid_error("At least one test failed."); + } + + // Test a C-style instantiation + double path_coeff[6] = {1, 2, 3, 4, 5, 6}; + Smear_HISQ hisq_fat_Cstyle(&GRID,path_coeff); + + if (param.benchmark) { + + autoView(U_v, Umu, CpuRead); // Gauge accessor + + // Read in lattice sequentially, Nloop times + double lookupTime = 0.; + for(int i=0;i