mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-10-25 10:09:34 +01:00 
			
		
		
		
	Compare commits
	
		
			11 Commits
		
	
	
		
			7f9d06f339
			...
			rmhmc_merg
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | cfa0576ffd | ||
|  | fe98e9f555 | ||
|  | 948d16fb06 | ||
|  | 58fbcaa399 | ||
|  | 9ad6836b0f | ||
|  | 026eb8a695 | ||
|  | 076580c232 | ||
|  | 7af6022a2a | ||
|  | 982a60536c | ||
|  | dc36d272ce | ||
|  | 515ff6bf62 | 
							
								
								
									
										4
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,7 +1,3 @@ | |||||||
| # Doxygen stuff |  | ||||||
| html/* |  | ||||||
| latex/* |  | ||||||
|  |  | ||||||
| # Compiled Object files # | # Compiled Object files # | ||||||
| ######################### | ######################### | ||||||
| *.slo | *.slo | ||||||
|   | |||||||
| @@ -34,7 +34,7 @@ | |||||||
| #pragma push_macro("__SYCL_DEVICE_ONLY__") | #pragma push_macro("__SYCL_DEVICE_ONLY__") | ||||||
| #undef __SYCL_DEVICE_ONLY__ | #undef __SYCL_DEVICE_ONLY__ | ||||||
| #define EIGEN_DONT_VECTORIZE | #define EIGEN_DONT_VECTORIZE | ||||||
| #undef EIGEN_USE_SYCL | //#undef EIGEN_USE_SYCL | ||||||
| #define __SYCL__REDEFINE__ | #define __SYCL__REDEFINE__ | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #define _GRID_FFT_H_ | #define _GRID_FFT_H_ | ||||||
|  |  | ||||||
| #ifdef HAVE_FFTW | #ifdef HAVE_FFTW | ||||||
| #if defined(USE_MKL) || defined(GRID_SYCL) | #ifdef USE_MKL | ||||||
| #include <fftw/fftw3.h> | #include <fftw/fftw3.h> | ||||||
| #else | #else | ||||||
| #include <fftw3.h> | #include <fftw3.h> | ||||||
|   | |||||||
| @@ -460,6 +460,53 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | template<class Matrix,class Field> | ||||||
|  | class QuadLinearOperator : public LinearOperatorBase<Field> { | ||||||
|  |   Matrix &_Mat; | ||||||
|  | public: | ||||||
|  |   RealD a0,a1,a2; | ||||||
|  |   QuadLinearOperator(Matrix &Mat): _Mat(Mat),a0(0.),a1(0.),a2(1.) {}; | ||||||
|  |   QuadLinearOperator(Matrix &Mat, RealD _a0,RealD _a1,RealD _a2): _Mat(Mat),a0(_a0),a1(_a1),a2(_a2) {}; | ||||||
|  |   // Support for coarsening to a multigrid | ||||||
|  |   void OpDiag (const Field &in, Field &out) { | ||||||
|  |     assert(0); | ||||||
|  |     _Mat.Mdiag(in,out); | ||||||
|  |   } | ||||||
|  |   void OpDir  (const Field &in, Field &out,int dir,int disp) { | ||||||
|  |     assert(0); | ||||||
|  |     _Mat.Mdir(in,out,dir,disp); | ||||||
|  |   } | ||||||
|  |   void OpDirAll  (const Field &in, std::vector<Field> &out){ | ||||||
|  |     assert(0); | ||||||
|  |     _Mat.MdirAll(in,out); | ||||||
|  |   } | ||||||
|  |   void HermOp (const Field &in, Field &out){ | ||||||
|  | //    _Mat.M(in,out); | ||||||
|  |     Field tmp1(in.Grid()); | ||||||
|  | //    Linop.HermOpAndNorm(psi, mmp, d, b); | ||||||
|  |     _Mat.M(in,tmp1); | ||||||
|  |     _Mat.M(tmp1,out); | ||||||
|  |     out *= a2; | ||||||
|  |     axpy(out, a1, tmp1, out); | ||||||
|  |     axpy(out, a0, in, out); | ||||||
|  | //    d=real(innerProduct(psi,mmp)); | ||||||
|  | //    b=norm2(mmp); | ||||||
|  |   } | ||||||
|  |   void AdjOp     (const Field &in, Field &out){ | ||||||
|  |     assert(0); | ||||||
|  |     _Mat.M(in,out); | ||||||
|  |   } | ||||||
|  |   void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ | ||||||
|  |     HermOp(in,out); | ||||||
|  |     ComplexD dot= innerProduct(in,out); n1=real(dot); | ||||||
|  |     n2=norm2(out); | ||||||
|  |   } | ||||||
|  |   void Op(const Field &in, Field &out){ | ||||||
|  |     assert(0); | ||||||
|  |     _Mat.M(in,out); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta | // Left  handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta  -->  ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta | ||||||
| // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi | // Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta  -->  ( 1 - Moe Mee^-1 Meo Moo^-1) phi=eta ; psi = Moo^-1 phi | ||||||
|   | |||||||
| @@ -36,11 +36,12 @@ NAMESPACE_BEGIN(Grid); | |||||||
| // Abstract base class. | // Abstract base class. | ||||||
| // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi) | // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi) | ||||||
| // and returns a forecasted solution to the system D*psi = phi (psi). | // and returns a forecasted solution to the system D*psi = phi (psi). | ||||||
| template<class Matrix, class Field> | // Changing to operator | ||||||
|  | template<class LinearOperatorBase, class Field> | ||||||
| class Forecast | class Forecast | ||||||
| { | { | ||||||
| public: | public: | ||||||
|   virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0; |   virtual Field operator()(LinearOperatorBase &Mat, const Field& phi, const std::vector<Field>& chi) = 0; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012), | // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012), | ||||||
| @@ -54,13 +55,13 @@ public: | |||||||
|   Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns) |   Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns) | ||||||
|   { |   { | ||||||
|     int degree = prev_solns.size(); |     int degree = prev_solns.size(); | ||||||
|  |     std::cout << GridLogMessage << "ChronoForecast: degree= " << degree << std::endl; | ||||||
|     Field chi(phi); // forecasted solution |     Field chi(phi); // forecasted solution | ||||||
|  |  | ||||||
|     // Trivial cases |     // Trivial cases | ||||||
|     if(degree == 0){ chi = Zero(); return chi; } |     if(degree == 0){ chi = Zero(); return chi; } | ||||||
|     else if(degree == 1){ return prev_solns[0]; } |     else if(degree == 1){ return prev_solns[0]; } | ||||||
|  |  | ||||||
|     //    RealD dot; |  | ||||||
|     ComplexD xp; |     ComplexD xp; | ||||||
|     Field r(phi); // residual |     Field r(phi); // residual | ||||||
|     Field Mv(phi); |     Field Mv(phi); | ||||||
| @@ -83,8 +84,9 @@ public: | |||||||
|     // Perform sparse matrix multiplication and construct rhs |     // Perform sparse matrix multiplication and construct rhs | ||||||
|     for(int i=0; i<degree; i++){ |     for(int i=0; i<degree; i++){ | ||||||
|       b[i] = innerProduct(v[i],phi); |       b[i] = innerProduct(v[i],phi); | ||||||
|       Mat.M(v[i],Mv); | //      Mat.M(v[i],Mv); | ||||||
|       Mat.Mdag(Mv,MdagMv[i]); | //      Mat.Mdag(Mv,MdagMv[i]); | ||||||
|  |       Mat.HermOp(v[i],MdagMv[i]); | ||||||
|       G[i][i] = innerProduct(v[i],MdagMv[i]); |       G[i][i] = innerProduct(v[i],MdagMv[i]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k, | |||||||
|  * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and |  * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and | ||||||
|  * type = 1 for the approximation which is infinite at x = 0. */ |  * type = 1 for the approximation which is infinite at x = 0. */ | ||||||
|  |  | ||||||
| zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) { | zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) { | ||||||
|   INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F, |   INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F, | ||||||
|     l, invlambda, xi, xisq, *tv, s, opl; |     l, invlambda, xi, xisq, *tv, s, opl; | ||||||
|   int m, czero, ts; |   int m, czero, ts; | ||||||
| @@ -375,12 +375,12 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) { | |||||||
|   construct_partfrac(d); |   construct_partfrac(d); | ||||||
|   construct_contfrac(d); |   construct_contfrac(d); | ||||||
|  |  | ||||||
|   /* Converting everything to ZOLO_PRECISION for external use only */ |   /* Converting everything to PRECISION for external use only */ | ||||||
|  |  | ||||||
|   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); |   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); | ||||||
|   zd -> A = (ZOLO_PRECISION) d -> A; |   zd -> A = (PRECISION) d -> A; | ||||||
|   zd -> Delta = (ZOLO_PRECISION) d -> Delta; |   zd -> Delta = (PRECISION) d -> Delta; | ||||||
|   zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; |   zd -> epsilon = (PRECISION) d -> epsilon; | ||||||
|   zd -> n = d -> n; |   zd -> n = d -> n; | ||||||
|   zd -> type = d -> type; |   zd -> type = d -> type; | ||||||
|   zd -> dn = d -> dn; |   zd -> dn = d -> dn; | ||||||
| @@ -390,24 +390,24 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) { | |||||||
|   zd -> deg_num = d -> deg_num; |   zd -> deg_num = d -> deg_num; | ||||||
|   zd -> deg_denom = d -> deg_denom; |   zd -> deg_denom = d -> deg_denom; | ||||||
|  |  | ||||||
|   zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); |   zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; |   for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; | ||||||
|   free(d -> a); |   free(d -> a); | ||||||
|  |  | ||||||
|   zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); |   zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; |   for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; | ||||||
|   free(d -> ap); |   free(d -> ap); | ||||||
|  |  | ||||||
|   zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); |   zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; |   for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; | ||||||
|   free(d -> alpha); |   free(d -> alpha); | ||||||
|  |  | ||||||
|   zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); |   zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; |   for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; | ||||||
|   free(d -> beta); |   free(d -> beta); | ||||||
|  |  | ||||||
|   zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); |   zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; |   for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; | ||||||
|   free(d -> gamma); |   free(d -> gamma); | ||||||
|  |  | ||||||
|   free(d); |   free(d); | ||||||
| @@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata) | |||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) { | zolotarev_data* higham(PRECISION epsilon, int n) { | ||||||
|   INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq; |   INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq; | ||||||
|   int m, czero; |   int m, czero; | ||||||
|   zolotarev_data *zd; |   zolotarev_data *zd; | ||||||
| @@ -481,9 +481,9 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) { | |||||||
|   /* Converting everything to PRECISION for external use only */ |   /* Converting everything to PRECISION for external use only */ | ||||||
|  |  | ||||||
|   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); |   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data)); | ||||||
|   zd -> A = (ZOLO_PRECISION) d -> A; |   zd -> A = (PRECISION) d -> A; | ||||||
|   zd -> Delta = (ZOLO_PRECISION) d -> Delta; |   zd -> Delta = (PRECISION) d -> Delta; | ||||||
|   zd -> epsilon = (ZOLO_PRECISION) d -> epsilon; |   zd -> epsilon = (PRECISION) d -> epsilon; | ||||||
|   zd -> n = d -> n; |   zd -> n = d -> n; | ||||||
|   zd -> type = d -> type; |   zd -> type = d -> type; | ||||||
|   zd -> dn = d -> dn; |   zd -> dn = d -> dn; | ||||||
| @@ -493,24 +493,24 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) { | |||||||
|   zd -> deg_num = d -> deg_num; |   zd -> deg_num = d -> deg_num; | ||||||
|   zd -> deg_denom = d -> deg_denom; |   zd -> deg_denom = d -> deg_denom; | ||||||
|  |  | ||||||
|   zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION)); |   zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m]; |   for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m]; | ||||||
|   free(d -> a); |   free(d -> a); | ||||||
|  |  | ||||||
|   zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION)); |   zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m]; |   for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m]; | ||||||
|   free(d -> ap); |   free(d -> ap); | ||||||
|  |  | ||||||
|   zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION)); |   zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m]; |   for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m]; | ||||||
|   free(d -> alpha); |   free(d -> alpha); | ||||||
|  |  | ||||||
|   zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION)); |   zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m]; |   for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m]; | ||||||
|   free(d -> beta); |   free(d -> beta); | ||||||
|  |  | ||||||
|   zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION)); |   zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION)); | ||||||
|   for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m]; |   for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m]; | ||||||
|   free(d -> gamma); |   free(d -> gamma); | ||||||
|  |  | ||||||
|   free(d); |   free(d); | ||||||
| @@ -523,17 +523,17 @@ NAMESPACE_END(Grid); | |||||||
| #ifdef TEST | #ifdef TEST | ||||||
|  |  | ||||||
| #undef ZERO | #undef ZERO | ||||||
| #define ZERO ((ZOLO_PRECISION) 0) | #define ZERO ((PRECISION) 0) | ||||||
| #undef ONE | #undef ONE | ||||||
| #define ONE ((ZOLO_PRECISION) 1) | #define ONE ((PRECISION) 1) | ||||||
| #undef TWO | #undef TWO | ||||||
| #define TWO ((ZOLO_PRECISION) 2) | #define TWO ((PRECISION) 2) | ||||||
|  |  | ||||||
| /* Evaluate the rational approximation R(x) using the factored form */ | /* Evaluate the rational approximation R(x) using the factored form */ | ||||||
|  |  | ||||||
| static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { | static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) { | ||||||
|   int m; |   int m; | ||||||
|   ZOLO_PRECISION R; |   PRECISION R; | ||||||
|  |  | ||||||
|   if (rdata -> type == 0) { |   if (rdata -> type == 0) { | ||||||
|     R = rdata -> A * x; |     R = rdata -> A * x; | ||||||
| @@ -551,9 +551,9 @@ static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { | |||||||
|  |  | ||||||
| /* Evaluate the rational approximation R(x) using the partial fraction form */ | /* Evaluate the rational approximation R(x) using the partial fraction form */ | ||||||
|  |  | ||||||
| static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { | static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) { | ||||||
|   int m; |   int m; | ||||||
|   ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1]; |   PRECISION R = rdata -> alpha[rdata -> da - 1]; | ||||||
|   for (m = 0; m < rdata -> dd; m++) |   for (m = 0; m < rdata -> dd; m++) | ||||||
|     R += rdata -> alpha[m] / (x * x - rdata -> ap[m]); |     R += rdata -> alpha[m] / (x * x - rdata -> ap[m]); | ||||||
|   if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x); |   if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x); | ||||||
| @@ -568,18 +568,18 @@ static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* | |||||||
|  * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0, |  * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0, | ||||||
|  * but with signalling overflow you will get an error message. */ |  * but with signalling overflow you will get an error message. */ | ||||||
|  |  | ||||||
| static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { | static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) { | ||||||
|   int m; |   int m; | ||||||
|   ZOLO_PRECISION R = rdata -> beta[0] * x; |   PRECISION R = rdata -> beta[0] * x; | ||||||
|   for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R; |   for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R; | ||||||
|   return R; |   return R; | ||||||
| }     | }     | ||||||
|  |  | ||||||
| /* Evaluate the rational approximation R(x) using Cayley form */ | /* Evaluate the rational approximation R(x) using Cayley form */ | ||||||
|  |  | ||||||
| static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) { | static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) { | ||||||
|   int m; |   int m; | ||||||
|   ZOLO_PRECISION T; |   PRECISION T; | ||||||
|  |  | ||||||
|   T = rdata -> type == 0 ? ONE : -ONE; |   T = rdata -> type == 0 ? ONE : -ONE; | ||||||
|   for (m = 0; m < rdata -> n; m++) |   for (m = 0; m < rdata -> n; m++) | ||||||
| @@ -607,7 +607,7 @@ int main(int argc, char** argv) { | |||||||
|   int m, n, plotpts = 5000, type = 0; |   int m, n, plotpts = 5000, type = 0; | ||||||
|   float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr; |   float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr; | ||||||
|   zolotarev_data *rdata; |   zolotarev_data *rdata; | ||||||
|   ZOLO_PRECISION y; |   PRECISION y; | ||||||
|   FILE *plot_function, *plot_error,  |   FILE *plot_function, *plot_error,  | ||||||
|     *plot_partfrac, *plot_contfrac, *plot_cayley; |     *plot_partfrac, *plot_contfrac, *plot_cayley; | ||||||
|  |  | ||||||
| @@ -626,13 +626,13 @@ int main(int argc, char** argv) { | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   rdata = type == 2  |   rdata = type == 2  | ||||||
|     ? higham((ZOLO_PRECISION) eps, n)  |     ? higham((PRECISION) eps, n)  | ||||||
|     : zolotarev((ZOLO_PRECISION) eps, n, type); |     : zolotarev((PRECISION) eps, n, type); | ||||||
|  |  | ||||||
|   printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"  |   printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"  | ||||||
| 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION) | 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION) | ||||||
| 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION) | 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION) | ||||||
| 	 "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION) | 	 "\tPRECISION = " STRINGIFY(PRECISION) | ||||||
| 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n" | 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n" | ||||||
| 	 "\tDelta = %g (maximum error)\n\n" | 	 "\tDelta = %g (maximum error)\n\n" | ||||||
| 	 "\tA = %g (overall factor)\n", | 	 "\tA = %g (overall factor)\n", | ||||||
| @@ -681,15 +681,15 @@ int main(int argc, char** argv) { | |||||||
|     x = 2.4 * (float) m / plotpts - 1.2; |     x = 2.4 * (float) m / plotpts - 1.2; | ||||||
|     if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) { |     if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) { | ||||||
|       /* skip x = 0 for type 1, as R(0) is singular */ |       /* skip x = 0 for type 1, as R(0) is singular */ | ||||||
|       y = zolotarev_eval((ZOLO_PRECISION) x, rdata); |       y = zolotarev_eval((PRECISION) x, rdata); | ||||||
|       fprintf(plot_function, "%g %g\n", x, (float) y); |       fprintf(plot_function, "%g %g\n", x, (float) y); | ||||||
|       fprintf(plot_error, "%g %g\n", |       fprintf(plot_error, "%g %g\n", | ||||||
| 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta)); | 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta)); | ||||||
|       ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y) |       ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y) | ||||||
| 		       / rdata -> Delta); | 		       / rdata -> Delta); | ||||||
|       ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y) |       ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y) | ||||||
| 		       / rdata -> Delta); | 		       / rdata -> Delta); | ||||||
|       ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y) |       ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y) | ||||||
| 		       / rdata -> Delta); | 		       / rdata -> Delta); | ||||||
|       if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) { |       if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) { | ||||||
| 	maxypferr = MAX(maxypferr, fabs(ypferr)); | 	maxypferr = MAX(maxypferr, fabs(ypferr)); | ||||||
|   | |||||||
| @@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx); | |||||||
| #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> | #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY> | ||||||
|  |  | ||||||
| #ifndef ZOLOTAREV_INTERNAL | #ifndef ZOLOTAREV_INTERNAL | ||||||
| #ifndef ZOLO_PRECISION | #ifndef PRECISION | ||||||
| #define ZOLO_PRECISION double | #define PRECISION double | ||||||
| #endif | #endif | ||||||
| #define ZPRECISION ZOLO_PRECISION | #define ZPRECISION PRECISION | ||||||
| #define ZOLOTAREV_DATA zolotarev_data | #define ZOLOTAREV_DATA zolotarev_data | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| @@ -77,8 +77,8 @@ typedef struct { | |||||||
|  * zolotarev_data structure. The arguments must satisfy the constraints that |  * zolotarev_data structure. The arguments must satisfy the constraints that | ||||||
|  * epsilon > 0, n > 0, and type = 0 or 1. */ |  * epsilon > 0, n > 0, and type = 0 or 1. */ | ||||||
|  |  | ||||||
| ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ; | ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ; | ||||||
| ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type); | ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type); | ||||||
| void zolotarev_free(zolotarev_data *zdata); | void zolotarev_free(zolotarev_data *zdata); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| @@ -86,4 +86,3 @@ void zolotarev_free(zolotarev_data *zdata); | |||||||
| NAMESPACE_END(Approx); | NAMESPACE_END(Approx); | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,34 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: BatchedBlas.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2023 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <pboyle@bnl.gov> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #include <Grid/GridCore.h> |  | ||||||
| #include <Grid/algorithms/blas/BatchedBlas.h> |  | ||||||
| NAMESPACE_BEGIN(Grid); |  | ||||||
| gridblasHandle_t GridBLAS::gridblasHandle; |  | ||||||
| int              GridBLAS::gridblasInit; |  | ||||||
| NAMESPACE_END(Grid); |  | ||||||
|  |  | ||||||
| @@ -1,727 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: BatchedBlas.h |  | ||||||
|  |  | ||||||
|     Copyright (C) 2023 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <pboyle@bnl.gov> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #ifdef GRID_HIP |  | ||||||
| #include <hipblas/hipblas.h> |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
| #include <cublas_v2.h> |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
| #include <oneapi/mkl.hpp> |  | ||||||
| #endif |  | ||||||
| #if 0 |  | ||||||
| #define GRID_ONE_MKL |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_ONE_MKL |  | ||||||
| #include <oneapi/mkl.hpp> |  | ||||||
| #endif |  | ||||||
| ///////////////////////////////////////////////////////////////////////	   |  | ||||||
| // Need to rearrange lattice data to be in the right format for a |  | ||||||
| // batched multiply. Might as well make these static, dense packed |  | ||||||
| /////////////////////////////////////////////////////////////////////// |  | ||||||
| NAMESPACE_BEGIN(Grid); |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|   typedef hipblasHandle_t gridblasHandle_t; |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|   typedef cublasHandle_t gridblasHandle_t; |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|   typedef cl::sycl::queue *gridblasHandle_t; |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_ONE_MKL |  | ||||||
|   typedef cl::sycl::queue *gridblasHandle_t; |  | ||||||
| #endif |  | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) |  | ||||||
|   typedef int32_t gridblasHandle_t; |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ; |  | ||||||
|  |  | ||||||
| class GridBLAS { |  | ||||||
| public: |  | ||||||
|  |  | ||||||
|    |  | ||||||
|   static gridblasHandle_t gridblasHandle; |  | ||||||
|   static int            gridblasInit; |  | ||||||
|    |  | ||||||
|   static void Init(void) |  | ||||||
|   { |  | ||||||
|     if ( ! gridblasInit ) { |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|       std::cout << "cublasCreate"<<std::endl; |  | ||||||
|       cublasCreate(&gridblasHandle); |  | ||||||
|       cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|       std::cout << "hipblasCreate"<<std::endl; |  | ||||||
|       hipblasCreate(&gridblasHandle); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|       gridblasHandle = theGridAccelerator; |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_ONE_MKL |  | ||||||
|       cl::sycl::cpu_selector selector; |  | ||||||
|       cl::sycl::device selectedDevice { selector }; |  | ||||||
|       gridblasHandle =new sycl::queue (selectedDevice); |  | ||||||
| #endif |  | ||||||
|       gridblasInit=1; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   // Force construct once |  | ||||||
|   GridBLAS() { Init(); }; |  | ||||||
|   ~GridBLAS() { }; |  | ||||||
|    |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // BLAS GEMM conventions: |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // - C = alpha A * B + beta C |  | ||||||
|   // Dimensions: |  | ||||||
|   // - C_m.n |  | ||||||
|   // - A_m.k |  | ||||||
|   // - B_k.n |  | ||||||
|   // - Flops = 8 M N K |  | ||||||
|   // - Bytes = 2*sizeof(word) * (MN+MK+KN) |  | ||||||
|   // M=60, N=12 |  | ||||||
|   // Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   void synchronise(void) |  | ||||||
|   { |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|     auto err = hipDeviceSynchronize(); |  | ||||||
|     assert(err==hipSuccess); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|     auto err = cudaDeviceSynchronize(); |  | ||||||
|     assert(err==cudaSuccess); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|     accelerator_barrier(); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_ONE_MKL |  | ||||||
|     gridblasHandle->wait(); |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   void gemmBatched(int m,int n, int k, |  | ||||||
| 		   ComplexD alpha, |  | ||||||
| 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices |  | ||||||
| 		   deviceVector<ComplexD*> &Bkn, |  | ||||||
| 		   ComplexD beta, |  | ||||||
| 		   deviceVector<ComplexD*> &Cmn) |  | ||||||
|   { |  | ||||||
|     gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, |  | ||||||
| 		m,n,k, |  | ||||||
| 		alpha, |  | ||||||
| 		Amk, |  | ||||||
| 		Bkn, |  | ||||||
| 		beta, |  | ||||||
| 		Cmn); |  | ||||||
|   } |  | ||||||
|   void gemmBatched(int m,int n, int k, |  | ||||||
| 		   ComplexF alpha, |  | ||||||
| 		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices |  | ||||||
| 		   deviceVector<ComplexF*> &Bkn, |  | ||||||
| 		   ComplexF beta, |  | ||||||
| 		   deviceVector<ComplexF*> &Cmn) |  | ||||||
|   { |  | ||||||
|     gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, |  | ||||||
| 		m,n,k, |  | ||||||
| 		alpha, |  | ||||||
| 		Amk, |  | ||||||
| 		Bkn, |  | ||||||
| 		beta, |  | ||||||
| 		Cmn); |  | ||||||
|   } |  | ||||||
|   void gemmBatched(int m,int n, int k, |  | ||||||
| 		   RealD alpha, |  | ||||||
| 		   deviceVector<RealD*> &Amk,  // pointer list to matrices |  | ||||||
| 		   deviceVector<RealD*> &Bkn, |  | ||||||
| 		   RealD beta, |  | ||||||
| 		   deviceVector<RealD*> &Cmn) |  | ||||||
|   { |  | ||||||
|     gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, |  | ||||||
| 		m,n,k, |  | ||||||
| 		alpha, |  | ||||||
| 		Amk, |  | ||||||
| 		Bkn, |  | ||||||
| 		beta, |  | ||||||
| 		Cmn); |  | ||||||
|   } |  | ||||||
|   void gemmBatched(int m,int n, int k, |  | ||||||
| 		   RealF alpha, |  | ||||||
| 		   deviceVector<RealF*> &Amk,  // pointer list to matrices |  | ||||||
| 		   deviceVector<RealF*> &Bkn, |  | ||||||
| 		   RealF beta, |  | ||||||
| 		   deviceVector<RealF*> &Cmn) |  | ||||||
|   { |  | ||||||
|     gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N, |  | ||||||
| 		m,n,k, |  | ||||||
| 		alpha, |  | ||||||
| 		Amk, |  | ||||||
| 		Bkn, |  | ||||||
| 		beta, |  | ||||||
| 		Cmn); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   void gemmBatched(GridBLASOperation_t OpA, |  | ||||||
| 		   GridBLASOperation_t OpB, |  | ||||||
| 		   int m,int n, int k, |  | ||||||
| 		   ComplexD alpha, |  | ||||||
| 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices |  | ||||||
| 		   deviceVector<ComplexD*> &Bkn, |  | ||||||
| 		   ComplexD beta, |  | ||||||
| 		   deviceVector<ComplexD*> &Cmn) |  | ||||||
|   { |  | ||||||
|     RealD t2=usecond(); |  | ||||||
|     int32_t batchCount = Amk.size(); |  | ||||||
|     assert(Bkn.size()==batchCount); |  | ||||||
|     assert(Cmn.size()==batchCount); |  | ||||||
|  |  | ||||||
|     int lda = m; // m x k column major |  | ||||||
|     int ldb = k; // k x n column major |  | ||||||
|     int ldc = m; // m x b column major |  | ||||||
|     if(OpA!=GridBLAS_OP_N) |  | ||||||
|       lda = k; |  | ||||||
|     if(OpB!=GridBLAS_OP_N) |  | ||||||
|       ldb = n; |  | ||||||
|      |  | ||||||
|     static deviceVector<ComplexD> alpha_p(1); |  | ||||||
|     static deviceVector<ComplexD> beta_p(1); |  | ||||||
|     // can prestore the 1 and the zero on device |  | ||||||
|     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); |  | ||||||
|     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); |  | ||||||
|     RealD t0=usecond(); |  | ||||||
|     //    std::cout << "ZgemmBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl; |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|     hipblasOperation_t hOpA; |  | ||||||
|     hipblasOperation_t hOpB; |  | ||||||
|     if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; |  | ||||||
|     if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; |  | ||||||
|     if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; |  | ||||||
|     if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; |  | ||||||
|     if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; |  | ||||||
|     if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; |  | ||||||
|     auto err = hipblasZgemmBatched(gridblasHandle, |  | ||||||
| 				   hOpA, |  | ||||||
| 				   hOpB, |  | ||||||
| 				   m,n,k, |  | ||||||
| 				   (hipblasDoubleComplex *) &alpha_p[0], |  | ||||||
| 				   (hipblasDoubleComplex **)&Amk[0], lda, |  | ||||||
| 				   (hipblasDoubleComplex **)&Bkn[0], ldb, |  | ||||||
| 				   (hipblasDoubleComplex *) &beta_p[0], |  | ||||||
| 				   (hipblasDoubleComplex **)&Cmn[0], ldc, |  | ||||||
| 				   batchCount); |  | ||||||
|     //	 std::cout << " hipblas return code " <<(int)err<<std::endl; |  | ||||||
|     assert(err==HIPBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|     cublasOperation_t hOpA; |  | ||||||
|     cublasOperation_t hOpB; |  | ||||||
|     if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; |  | ||||||
|     if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; |  | ||||||
|     if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; |  | ||||||
|     if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; |  | ||||||
|     if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; |  | ||||||
|     if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; |  | ||||||
|     auto err = cublasZgemmBatched(gridblasHandle, |  | ||||||
| 				  hOpA, |  | ||||||
| 				  hOpB, |  | ||||||
| 				  m,n,k, |  | ||||||
| 				  (cuDoubleComplex *) &alpha_p[0], |  | ||||||
| 				  (cuDoubleComplex **)&Amk[0], lda, |  | ||||||
| 				  (cuDoubleComplex **)&Bkn[0], ldb, |  | ||||||
| 				  (cuDoubleComplex *) &beta_p[0], |  | ||||||
| 				  (cuDoubleComplex **)&Cmn[0], ldc, |  | ||||||
| 				  batchCount); |  | ||||||
|     assert(err==CUBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI |  | ||||||
| #warning "oneMKL implementation not built " |  | ||||||
| #endif |  | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) |  | ||||||
|     // Need a default/reference implementation |  | ||||||
|     int sda = lda*k; |  | ||||||
|     int sdb = ldb*k; |  | ||||||
|     int sdc = ldc*n; |  | ||||||
|     for (int p = 0; p < batchCount; ++p) { |  | ||||||
|       for (int mm = 0; mm < m; ++mm) { |  | ||||||
| 	for (int nn = 0; nn < n; ++nn) { |  | ||||||
| 	  ComplexD c_mn(0.0); |  | ||||||
| 	  for (int kk = 0; kk < k; ++kk) |  | ||||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; |  | ||||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
| #endif |  | ||||||
|     //    synchronise(); |  | ||||||
|      RealD t1=usecond(); |  | ||||||
|      RealD flops = 8.0*m*n*k*batchCount; |  | ||||||
|      RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount; |  | ||||||
|      //     std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl; |  | ||||||
|      //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl; |  | ||||||
|      //     std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   void gemmBatched(GridBLASOperation_t OpA, |  | ||||||
| 		   GridBLASOperation_t OpB, |  | ||||||
| 		   int m,int n, int k, |  | ||||||
| 		   ComplexF alpha, |  | ||||||
| 		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices |  | ||||||
| 		   deviceVector<ComplexF*> &Bkn, |  | ||||||
| 		   ComplexF beta, |  | ||||||
| 		   deviceVector<ComplexF*> &Cmn) |  | ||||||
|   { |  | ||||||
|     RealD t2=usecond(); |  | ||||||
|     int32_t batchCount = Amk.size(); |  | ||||||
|  |  | ||||||
|     int lda = m; // m x k column major |  | ||||||
|     int ldb = k; // k x n column major |  | ||||||
|     int ldc = m; // m x b column major |  | ||||||
|     if(OpA!=GridBLAS_OP_N) |  | ||||||
|       lda = k; |  | ||||||
|     if(OpB!=GridBLAS_OP_N) |  | ||||||
|       ldb = n; |  | ||||||
|     static deviceVector<ComplexF> alpha_p(1); |  | ||||||
|     static deviceVector<ComplexF> beta_p(1); |  | ||||||
|     // can prestore the 1 and the zero on device |  | ||||||
|     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF)); |  | ||||||
|     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF)); |  | ||||||
|     RealD t0=usecond(); |  | ||||||
|  |  | ||||||
|     assert(Bkn.size()==batchCount); |  | ||||||
|     assert(Cmn.size()==batchCount); |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|     hipblasOperation_t hOpA; |  | ||||||
|     hipblasOperation_t hOpB; |  | ||||||
|     if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; |  | ||||||
|     if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; |  | ||||||
|     if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; |  | ||||||
|     if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; |  | ||||||
|     if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; |  | ||||||
|     if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; |  | ||||||
|     auto err = hipblasCgemmBatched(gridblasHandle, |  | ||||||
| 				   hOpA, |  | ||||||
| 				   hOpB, |  | ||||||
| 				   m,n,k, |  | ||||||
| 				   (hipblasComplex *) &alpha_p[0], |  | ||||||
| 				   (hipblasComplex **)&Amk[0], lda, |  | ||||||
| 				   (hipblasComplex **)&Bkn[0], ldb, |  | ||||||
| 				   (hipblasComplex *) &beta_p[0], |  | ||||||
| 				   (hipblasComplex **)&Cmn[0], ldc, |  | ||||||
| 				   batchCount); |  | ||||||
|  |  | ||||||
|     assert(err==HIPBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|     cublasOperation_t hOpA; |  | ||||||
|     cublasOperation_t hOpB; |  | ||||||
|     if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; |  | ||||||
|     if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; |  | ||||||
|     if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; |  | ||||||
|     if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; |  | ||||||
|     if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; |  | ||||||
|     if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; |  | ||||||
|     auto err = cublasCgemmBatched(gridblasHandle, |  | ||||||
| 				  hOpA, |  | ||||||
| 				  hOpB, |  | ||||||
| 				  m,n,k, |  | ||||||
| 				  (cuComplex *) &alpha_p[0], |  | ||||||
| 				  (cuComplex **)&Amk[0], lda, |  | ||||||
| 				  (cuComplex **)&Bkn[0], ldb, |  | ||||||
| 				  (cuComplex *) &beta_p[0], |  | ||||||
| 				  (cuComplex **)&Cmn[0], ldc, |  | ||||||
| 				  batchCount); |  | ||||||
|     assert(err==CUBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI |  | ||||||
| #warning "oneMKL implementation not built " |  | ||||||
| #endif |  | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) |  | ||||||
|     int sda = lda*k; |  | ||||||
|     int sdb = ldb*k; |  | ||||||
|     int sdc = ldc*n; |  | ||||||
|     ComplexF alphaf(real(alpha),imag(alpha)); |  | ||||||
|     ComplexF betaf(real(beta),imag(beta)); |  | ||||||
|     // Need a default/reference implementation |  | ||||||
|     for (int p = 0; p < batchCount; ++p) { |  | ||||||
|       for (int mm = 0; mm < m; ++mm) { |  | ||||||
| 	for (int nn = 0; nn < n; ++nn) { |  | ||||||
| 	  ComplexF c_mn(0.0); |  | ||||||
| 	  for (int kk = 0; kk < k; ++kk) |  | ||||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; |  | ||||||
| 	  Cmn[p][mm + nn*ldc] =  (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ]; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
| #endif |  | ||||||
|      RealD t1=usecond(); |  | ||||||
|      RealD flops = 8.0*m*n*k*batchCount; |  | ||||||
|      RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   /////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Single precision real GEMM |  | ||||||
|   /////////////////////////////////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   void gemmBatched(GridBLASOperation_t OpA, |  | ||||||
| 		   GridBLASOperation_t OpB, |  | ||||||
| 		   int m,int n, int k, |  | ||||||
| 		   RealF alpha, |  | ||||||
| 		   deviceVector<RealF*> &Amk,  // pointer list to matrices |  | ||||||
| 		   deviceVector<RealF*> &Bkn, |  | ||||||
| 		   RealF beta, |  | ||||||
| 		   deviceVector<RealF*> &Cmn) |  | ||||||
|   { |  | ||||||
|     RealD t2=usecond(); |  | ||||||
|     int32_t batchCount = Amk.size(); |  | ||||||
|  |  | ||||||
|     int lda = m; // m x k column major |  | ||||||
|     int ldb = k; // k x n column major |  | ||||||
|     int ldc = m; // m x b column major |  | ||||||
|     if(OpA!=GridBLAS_OP_N) |  | ||||||
|       lda = k; |  | ||||||
|     if(OpB!=GridBLAS_OP_N) |  | ||||||
|       ldb = n; |  | ||||||
|     static deviceVector<RealF> alpha_p(1); |  | ||||||
|     static deviceVector<RealF> beta_p(1); |  | ||||||
|     // can prestore the 1 and the zero on device |  | ||||||
|     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF)); |  | ||||||
|     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF)); |  | ||||||
|     RealD t0=usecond(); |  | ||||||
|  |  | ||||||
|     assert(Bkn.size()==batchCount); |  | ||||||
|     assert(Cmn.size()==batchCount); |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|     hipblasOperation_t hOpA; |  | ||||||
|     hipblasOperation_t hOpB; |  | ||||||
|     if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; |  | ||||||
|     if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; |  | ||||||
|     if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; |  | ||||||
|     if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; |  | ||||||
|     if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; |  | ||||||
|     if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; |  | ||||||
|     auto err = hipblasSgemmBatched(gridblasHandle, |  | ||||||
| 				   hOpA, |  | ||||||
| 				   hOpB, |  | ||||||
| 				   m,n,k, |  | ||||||
| 				   (float *) &alpha_p[0], |  | ||||||
| 				   (float **)&Amk[0], lda, |  | ||||||
| 				   (float **)&Bkn[0], ldb, |  | ||||||
| 				   (float *) &beta_p[0], |  | ||||||
| 				   (float **)&Cmn[0], ldc, |  | ||||||
| 				   batchCount); |  | ||||||
|     assert(err==HIPBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|     cublasOperation_t hOpA; |  | ||||||
|     cublasOperation_t hOpB; |  | ||||||
|     if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; |  | ||||||
|     if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; |  | ||||||
|     if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; |  | ||||||
|     if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; |  | ||||||
|     if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; |  | ||||||
|     if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; |  | ||||||
|     auto err = cublasSgemmBatched(gridblasHandle, |  | ||||||
| 				  hOpA, |  | ||||||
| 				  hOpB, |  | ||||||
| 				  m,n,k, |  | ||||||
| 				  (float *) &alpha_p[0], |  | ||||||
| 				  (float **)&Amk[0], lda, |  | ||||||
| 				  (float **)&Bkn[0], ldb, |  | ||||||
| 				  (float *) &beta_p[0], |  | ||||||
| 				  (float **)&Cmn[0], ldc, |  | ||||||
| 				  batchCount); |  | ||||||
|     assert(err==CUBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI |  | ||||||
| #warning "oneMKL implementation not built " |  | ||||||
| #endif |  | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) |  | ||||||
|     int sda = lda*k; |  | ||||||
|     int sdb = ldb*k; |  | ||||||
|     int sdc = ldc*n; |  | ||||||
|     // Need a default/reference implementation |  | ||||||
|     for (int p = 0; p < batchCount; ++p) { |  | ||||||
|       for (int mm = 0; mm < m; ++mm) { |  | ||||||
| 	for (int nn = 0; nn < n; ++nn) { |  | ||||||
| 	  RealD c_mn(0.0); |  | ||||||
| 	  for (int kk = 0; kk < k; ++kk) |  | ||||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; |  | ||||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
| #endif |  | ||||||
|      RealD t1=usecond(); |  | ||||||
|      RealD flops = 2.0*m*n*k*batchCount; |  | ||||||
|      RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|    |  | ||||||
|   /////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Double precision real GEMM |  | ||||||
|   /////////////////////////////////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   void gemmBatched(GridBLASOperation_t OpA, |  | ||||||
| 		   GridBLASOperation_t OpB, |  | ||||||
| 		   int m,int n, int k, |  | ||||||
| 		   RealD alpha, |  | ||||||
| 		   deviceVector<RealD*> &Amk,  // pointer list to matrices |  | ||||||
| 		   deviceVector<RealD*> &Bkn, |  | ||||||
| 		   RealD beta, |  | ||||||
| 		   deviceVector<RealD*> &Cmn) |  | ||||||
|   { |  | ||||||
|     RealD t2=usecond(); |  | ||||||
|     int32_t batchCount = Amk.size(); |  | ||||||
|  |  | ||||||
|     int lda = m; // m x k column major |  | ||||||
|     int ldb = k; // k x n column major |  | ||||||
|     int ldc = m; // m x b column major |  | ||||||
|     if(OpA!=GridBLAS_OP_N) |  | ||||||
|       lda = k; |  | ||||||
|     if(OpB!=GridBLAS_OP_N) |  | ||||||
|       ldb = n; |  | ||||||
|      |  | ||||||
|     static deviceVector<RealD> alpha_p(1); |  | ||||||
|     static deviceVector<RealD> beta_p(1); |  | ||||||
|     // can prestore the 1 and the zero on device |  | ||||||
|     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD)); |  | ||||||
|     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD)); |  | ||||||
|     RealD t0=usecond(); |  | ||||||
|  |  | ||||||
|     assert(Bkn.size()==batchCount); |  | ||||||
|     assert(Cmn.size()==batchCount); |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|     hipblasOperation_t hOpA; |  | ||||||
|     hipblasOperation_t hOpB; |  | ||||||
|     if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N; |  | ||||||
|     if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T; |  | ||||||
|     if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C; |  | ||||||
|     if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N; |  | ||||||
|     if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T; |  | ||||||
|     if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C; |  | ||||||
|     auto err = hipblasDgemmBatched(gridblasHandle, |  | ||||||
| 				   HIPBLAS_OP_N, |  | ||||||
| 				   HIPBLAS_OP_N, |  | ||||||
| 				   m,n,k, |  | ||||||
| 				   (double *) &alpha_p[0], |  | ||||||
| 				   (double **)&Amk[0], lda, |  | ||||||
| 				   (double **)&Bkn[0], ldb, |  | ||||||
| 				   (double *) &beta_p[0], |  | ||||||
| 				   (double **)&Cmn[0], ldc, |  | ||||||
| 				   batchCount); |  | ||||||
|     assert(err==HIPBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|     cublasOperation_t hOpA; |  | ||||||
|     cublasOperation_t hOpB; |  | ||||||
|     if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N; |  | ||||||
|     if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T; |  | ||||||
|     if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C; |  | ||||||
|     if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N; |  | ||||||
|     if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T; |  | ||||||
|     if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C; |  | ||||||
|     auto err = cublasDgemmBatched(gridblasHandle, |  | ||||||
| 				  hOpA, |  | ||||||
| 				  hOpB, |  | ||||||
| 				  m,n,k, |  | ||||||
| 				  (double *) &alpha_p[0], |  | ||||||
| 				  (double **)&Amk[0], lda, |  | ||||||
| 				  (double **)&Bkn[0], ldb, |  | ||||||
| 				  (double *) &beta_p[0], |  | ||||||
| 				  (double **)&Cmn[0], ldc, |  | ||||||
| 				  batchCount); |  | ||||||
|     assert(err==CUBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|     /* |  | ||||||
|       int64_t m64=m; |  | ||||||
|       int64_t n64=n; |  | ||||||
|       int64_t k64=k; |  | ||||||
|       int64_t batchCount64=batchCount; |  | ||||||
|       oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator, |  | ||||||
|       onemkl::transpose::N, |  | ||||||
|       onemkl::transpose::N, |  | ||||||
|       &m64,&n64,&k64, |  | ||||||
|       (double *) &alpha_p[0], |  | ||||||
|       (double **)&Amk[0], lda, |  | ||||||
|       (double **)&Bkn[0], ldb, |  | ||||||
|       (double *) &beta_p[0], |  | ||||||
|       (double **)&Cmn[0], ldc, |  | ||||||
|       1,&batchCount64); |  | ||||||
|      */ |  | ||||||
|     //MKL’s cblas_<T>gemm_batch & OneAPI |  | ||||||
| #warning "oneMKL implementation not built " |  | ||||||
| #endif |  | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) |  | ||||||
|     int sda = lda*k; |  | ||||||
|     int sdb = ldb*k; |  | ||||||
|     int sdc = ldc*n; |  | ||||||
|     // Need a default/reference implementation |  | ||||||
|     for (int p = 0; p < batchCount; ++p) { |  | ||||||
|       for (int mm = 0; mm < m; ++mm) { |  | ||||||
| 	for (int nn = 0; nn < n; ++nn) { |  | ||||||
| 	  RealD c_mn(0.0); |  | ||||||
| 	  for (int kk = 0; kk < k; ++kk) |  | ||||||
| 	    c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb]; |  | ||||||
| 	  Cmn[p][mm + nn*ldc] =  (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ]; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
| #endif |  | ||||||
|      RealD t1=usecond(); |  | ||||||
|      RealD flops = 2.0*m*n*k*batchCount; |  | ||||||
|      RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|  |  | ||||||
|    |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|   // Strided case used by benchmark, but generally unused in Grid |  | ||||||
|   // Keep a code example in double complex, but don't generate the single and real variants for now |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////// |  | ||||||
|    |  | ||||||
|   void gemmStridedBatched(int m,int n, int k, |  | ||||||
| 			  ComplexD alpha, |  | ||||||
| 			  ComplexD* Amk,  // pointer list to matrices |  | ||||||
| 			  ComplexD* Bkn, |  | ||||||
| 			  ComplexD beta, |  | ||||||
| 			  ComplexD* Cmn, |  | ||||||
| 			  int batchCount) |  | ||||||
|   { |  | ||||||
|     // Use C-row major storage, so transpose calls |  | ||||||
|     int lda = m; // m x k column major |  | ||||||
|     int ldb = k; // k x n column major |  | ||||||
|     int ldc = m; // m x b column major |  | ||||||
|     int sda = m*k; |  | ||||||
|     int sdb = k*n; |  | ||||||
|     int sdc = m*n; |  | ||||||
|     deviceVector<ComplexD> alpha_p(1); |  | ||||||
|     deviceVector<ComplexD> beta_p(1); |  | ||||||
|     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD)); |  | ||||||
|     acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD)); |  | ||||||
|  |  | ||||||
|     //    std::cout << "blasZgemmStridedBatched mnk  "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl; |  | ||||||
|     //    std::cout << "blasZgemmStridedBatched ld   "<<lda<<","<<ldb<<","<<ldc<<std::endl; |  | ||||||
|     //    std::cout << "blasZgemmStridedBatched sd   "<<sda<<","<<sdb<<","<<sdc<<std::endl; |  | ||||||
| #ifdef GRID_HIP |  | ||||||
|     auto err = hipblasZgemmStridedBatched(gridblasHandle, |  | ||||||
| 					  HIPBLAS_OP_N, |  | ||||||
| 					  HIPBLAS_OP_N, |  | ||||||
| 					  m,n,k, |  | ||||||
| 					  (hipblasDoubleComplex *) &alpha_p[0], |  | ||||||
| 					  (hipblasDoubleComplex *) Amk, lda, sda, |  | ||||||
| 					  (hipblasDoubleComplex *) Bkn, ldb, sdb, |  | ||||||
| 					  (hipblasDoubleComplex *) &beta_p[0], |  | ||||||
| 					  (hipblasDoubleComplex *) Cmn, ldc, sdc, |  | ||||||
| 					  batchCount); |  | ||||||
|     assert(err==HIPBLAS_STATUS_SUCCESS); |  | ||||||
| #endif |  | ||||||
| #ifdef GRID_CUDA |  | ||||||
|     cublasZgemmStridedBatched(gridblasHandle, |  | ||||||
| 			      CUBLAS_OP_N, |  | ||||||
| 			      CUBLAS_OP_N, |  | ||||||
| 			      m,n,k, |  | ||||||
| 			      (cuDoubleComplex *) &alpha_p[0], |  | ||||||
| 			      (cuDoubleComplex *) Amk, lda, sda, |  | ||||||
| 			      (cuDoubleComplex *) Bkn, ldb, sdb, |  | ||||||
| 			      (cuDoubleComplex *) &beta_p[0], |  | ||||||
| 			      (cuDoubleComplex *) Cmn, ldc, sdc, |  | ||||||
| 			      batchCount); |  | ||||||
| #endif |  | ||||||
| #if defined(GRID_SYCL) || defined(GRID_ONE_MKL) |  | ||||||
|     oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle, |  | ||||||
| 						oneapi::mkl::transpose::N, |  | ||||||
| 						oneapi::mkl::transpose::N, |  | ||||||
| 						m,n,k, |  | ||||||
| 						alpha, |  | ||||||
| 						(const ComplexD *)Amk,lda,sda, |  | ||||||
| 						(const ComplexD *)Bkn,ldb,sdb, |  | ||||||
| 						beta, |  | ||||||
| 						(ComplexD *)Cmn,ldc,sdc, |  | ||||||
| 						batchCount); |  | ||||||
| #endif |  | ||||||
| #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL) |  | ||||||
|      // Need a default/reference implementation |  | ||||||
|      for (int p = 0; p < batchCount; ++p) { |  | ||||||
|        for (int mm = 0; mm < m; ++mm) { |  | ||||||
| 	 for (int nn = 0; nn < n; ++nn) { |  | ||||||
| 	   ComplexD c_mn(0.0); |  | ||||||
| 	   for (int kk = 0; kk < k; ++kk) |  | ||||||
| 	     c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb]; |  | ||||||
| 	   Cmn[mm + nn*ldc + p*sdc] =  (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc]; |  | ||||||
| 	 } |  | ||||||
|        } |  | ||||||
|      } |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   double benchmark(int M, int N, int K, int BATCH) |  | ||||||
|   { |  | ||||||
|     int32_t N_A = M*K*BATCH; |  | ||||||
|     int32_t N_B = K*N*BATCH; |  | ||||||
|     int32_t N_C = M*N*BATCH; |  | ||||||
|     deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD)); |  | ||||||
|     deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD)); |  | ||||||
|     deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD)); |  | ||||||
|     ComplexD alpha(1.0); |  | ||||||
|     ComplexD beta (1.0); |  | ||||||
|     RealD flops = 8.0*M*N*K*BATCH; |  | ||||||
|     int ncall=10; |  | ||||||
|     RealD t0 = usecond(); |  | ||||||
|     for(int i=0;i<ncall;i++){ |  | ||||||
|       gemmStridedBatched(M,N,K, |  | ||||||
| 			 alpha, |  | ||||||
| 			 &A[0], // m x k  |  | ||||||
| 			 &B[0], // k x n |  | ||||||
| 			 beta,  |  | ||||||
| 			 &C[0], // m x n |  | ||||||
| 			 BATCH); |  | ||||||
|     } |  | ||||||
|     synchronise(); |  | ||||||
|     RealD t1 = usecond(); |  | ||||||
|     RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH; |  | ||||||
|     flops = 8.0*M*N*K*BATCH*ncall; |  | ||||||
|     flops = flops/(t1-t0)/1.e3; |  | ||||||
|     return flops; // Returns gigaflops |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); |  | ||||||
| @@ -176,7 +176,6 @@ template<class T> using cshiftAllocator = std::allocator<T>; | |||||||
| template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;            | template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;            | ||||||
| template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;            | template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;            | ||||||
| template<class T> using commVector = std::vector<T,devAllocator<T> >; | template<class T> using commVector = std::vector<T,devAllocator<T> >; | ||||||
| template<class T> using deviceVector  = std::vector<T,devAllocator<T> >; |  | ||||||
| template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >; | template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >; | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -348,7 +348,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | |||||||
|   return offbytes; |   return offbytes; | ||||||
| } | } | ||||||
|  |  | ||||||
| #undef NVLINK_GET // Define to use get instead of put DMA |  | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
| 							 void *xmit, | 							 void *xmit, | ||||||
| 							 int dest,int dox, | 							 int dest,int dox, | ||||||
| @@ -381,15 +380,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|       list.push_back(rrq); |       list.push_back(rrq); | ||||||
|       off_node_bytes+=rbytes; |       off_node_bytes+=rbytes; | ||||||
|     } |     } | ||||||
| #ifdef NVLINK_GET |  | ||||||
|       void *shm = (void *) this->ShmBufferTranslate(from,xmit); |  | ||||||
|       assert(shm!=NULL); |  | ||||||
|       acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); |  | ||||||
| #endif |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   if (dox) { |   if (dox) { | ||||||
|     //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes); |  | ||||||
|     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { |     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { | ||||||
|       tag= dir+_processor*32; |       tag= dir+_processor*32; | ||||||
|       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); |       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); | ||||||
| @@ -397,12 +390,9 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|       list.push_back(xrq); |       list.push_back(xrq); | ||||||
|       off_node_bytes+=xbytes; |       off_node_bytes+=xbytes; | ||||||
|     } else { |     } else { | ||||||
| #ifndef NVLINK_GET |  | ||||||
|       void *shm = (void *) this->ShmBufferTranslate(dest,recv); |       void *shm = (void *) this->ShmBufferTranslate(dest,recv); | ||||||
|       assert(shm!=NULL); |       assert(shm!=NULL); | ||||||
|       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); |       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); | ||||||
| #endif |  | ||||||
|        |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -412,8 +402,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque | |||||||
| { | { | ||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|  |  | ||||||
|   acceleratorCopySynchronise(); |  | ||||||
|  |  | ||||||
|   if (nreq==0) return; |   if (nreq==0) return; | ||||||
|  |  | ||||||
|   std::vector<MPI_Status> status(nreq); |   std::vector<MPI_Status> status(nreq); | ||||||
|   | |||||||
| @@ -40,9 +40,6 @@ int                 GlobalSharedMemory::_ShmAlloc; | |||||||
| uint64_t            GlobalSharedMemory::_ShmAllocBytes; | uint64_t            GlobalSharedMemory::_ShmAllocBytes; | ||||||
|  |  | ||||||
| std::vector<void *> GlobalSharedMemory::WorldShmCommBufs; | std::vector<void *> GlobalSharedMemory::WorldShmCommBufs; | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
| void * GlobalSharedMemory::HostCommBuf; |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm; | Grid_MPI_Comm       GlobalSharedMemory::WorldShmComm; | ||||||
| int                 GlobalSharedMemory::WorldShmRank; | int                 GlobalSharedMemory::WorldShmRank; | ||||||
| @@ -69,26 +66,6 @@ void GlobalSharedMemory::SharedMemoryFree(void) | |||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
| // Alloc, free shmem region | // Alloc, free shmem region | ||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
| void *SharedMemory::HostBufferMalloc(size_t bytes){ |  | ||||||
|   void *ptr = (void *)host_heap_top; |  | ||||||
|   host_heap_top  += bytes; |  | ||||||
|   host_heap_bytes+= bytes; |  | ||||||
|   if (host_heap_bytes >= host_heap_size) { |  | ||||||
|     std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl; |  | ||||||
|     std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl; |  | ||||||
|     std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; |  | ||||||
|     std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl; |  | ||||||
|     std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl; |  | ||||||
|     assert(host_heap_bytes<host_heap_size); |  | ||||||
|   } |  | ||||||
|   return ptr; |  | ||||||
| } |  | ||||||
| void SharedMemory::HostBufferFreeAll(void) {  |  | ||||||
|   host_heap_top  =(size_t)HostCommBuf; |  | ||||||
|   host_heap_bytes=0; |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
| void *SharedMemory::ShmBufferMalloc(size_t bytes){ | void *SharedMemory::ShmBufferMalloc(size_t bytes){ | ||||||
|   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes |   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes | ||||||
|   void *ptr = (void *)heap_top; |   void *ptr = (void *)heap_top; | ||||||
|   | |||||||
| @@ -75,9 +75,7 @@ public: | |||||||
|   static int           Hugepages; |   static int           Hugepages; | ||||||
|  |  | ||||||
|   static std::vector<void *> WorldShmCommBufs; |   static std::vector<void *> WorldShmCommBufs; | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
|   static void *HostCommBuf; |  | ||||||
| #endif |  | ||||||
|   static Grid_MPI_Comm WorldComm; |   static Grid_MPI_Comm WorldComm; | ||||||
|   static int           WorldRank; |   static int           WorldRank; | ||||||
|   static int           WorldSize; |   static int           WorldSize; | ||||||
| @@ -122,13 +120,6 @@ private: | |||||||
|   size_t heap_bytes; |   size_t heap_bytes; | ||||||
|   size_t heap_size; |   size_t heap_size; | ||||||
|  |  | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
|   size_t host_heap_top;  // set in free all |  | ||||||
|   size_t host_heap_bytes;// set in free all |  | ||||||
|   void *HostCommBuf;     // set in SetCommunicator |  | ||||||
|   size_t host_heap_size; // set in SetCommunicator |  | ||||||
| #endif |  | ||||||
|    |  | ||||||
| protected: | protected: | ||||||
|  |  | ||||||
|   Grid_MPI_Comm    ShmComm; // for barriers |   Grid_MPI_Comm    ShmComm; // for barriers | ||||||
| @@ -160,10 +151,7 @@ public: | |||||||
|   void *ShmBufferTranslate(int rank,void * local_p); |   void *ShmBufferTranslate(int rank,void * local_p); | ||||||
|   void *ShmBufferMalloc(size_t bytes); |   void *ShmBufferMalloc(size_t bytes); | ||||||
|   void  ShmBufferFreeAll(void) ; |   void  ShmBufferFreeAll(void) ; | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |    | ||||||
|   void *HostBufferMalloc(size_t bytes); |  | ||||||
|   void HostBufferFreeAll(void); |  | ||||||
| #endif   |  | ||||||
|   ////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////// | ||||||
|   // Make info on Nodes & ranks and Shared memory available |   // Make info on Nodes & ranks and Shared memory available | ||||||
|   ////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -39,11 +39,9 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| #include <hip/hip_runtime_api.h> | #include <hip/hip_runtime_api.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_SYCL | #ifdef GRID_SYCL | ||||||
| #ifdef ACCELERATOR_AWARE_MPI |  | ||||||
| #define GRID_SYCL_LEVEL_ZERO_IPC | #define GRID_SYCL_LEVEL_ZERO_IPC | ||||||
| #define SHM_SOCKETS |  | ||||||
| #endif  |  | ||||||
| #include <syscall.h> | #include <syscall.h> | ||||||
|  | #define SHM_SOCKETS  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #include <sys/socket.h> | #include <sys/socket.h> | ||||||
| @@ -514,6 +512,46 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
| // Hugetlbfs mapping intended | // Hugetlbfs mapping intended | ||||||
| //////////////////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL) | #if defined(GRID_CUDA) ||defined(GRID_HIP)  || defined(GRID_SYCL) | ||||||
|  |  | ||||||
|  | //if defined(GRID_SYCL) | ||||||
|  | #if 0 | ||||||
|  | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
|  | { | ||||||
|  |   void * ShmCommBuf ;  | ||||||
|  |   assert(_ShmSetup==1); | ||||||
|  |   assert(_ShmAlloc==0); | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // allocate the pointer array for shared windows for our group | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   MPI_Barrier(WorldShmComm); | ||||||
|  |   WorldShmCommBufs.resize(WorldShmSize); | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Each MPI rank should allocate our own buffer | ||||||
|  |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   ShmCommBuf = acceleratorAllocDevice(bytes); | ||||||
|  |  | ||||||
|  |   if (ShmCommBuf == (void *)NULL ) { | ||||||
|  |     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; | ||||||
|  |     exit(EXIT_FAILURE);   | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes  | ||||||
|  | 	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl; | ||||||
|  |  | ||||||
|  |   SharedMemoryZero(ShmCommBuf,bytes); | ||||||
|  |  | ||||||
|  |   assert(WorldShmSize == 1); | ||||||
|  |   for(int r=0;r<WorldShmSize;r++){ | ||||||
|  |     WorldShmCommBufs[r] = ShmCommBuf; | ||||||
|  |   } | ||||||
|  |   _ShmAllocBytes=bytes; | ||||||
|  |   _ShmAlloc=1; | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)   | ||||||
| void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | ||||||
| { | { | ||||||
|   void * ShmCommBuf ;  |   void * ShmCommBuf ;  | ||||||
| @@ -536,9 +574,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Each MPI rank should allocate our own buffer |   // Each MPI rank should allocate our own buffer | ||||||
|   /////////////////////////////////////////////////////////////////////////////////////////////////////////// |   /////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
|   HostCommBuf= malloc(bytes); |  | ||||||
| #endif   |  | ||||||
|   ShmCommBuf = acceleratorAllocDevice(bytes); |   ShmCommBuf = acceleratorAllocDevice(bytes); | ||||||
|   if (ShmCommBuf == (void *)NULL ) { |   if (ShmCommBuf == (void *)NULL ) { | ||||||
|     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; |     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; | ||||||
| @@ -703,6 +738,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) | |||||||
|   _ShmAllocBytes=bytes; |   _ShmAllocBytes=bytes; | ||||||
|   _ShmAlloc=1; |   _ShmAlloc=1; | ||||||
| } | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
| #else  | #else  | ||||||
| #ifdef GRID_MPI3_SHMMMAP | #ifdef GRID_MPI3_SHMMMAP | ||||||
| @@ -926,12 +962,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) | |||||||
|   } |   } | ||||||
|   ShmBufferFreeAll(); |   ShmBufferFreeAll(); | ||||||
|  |  | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
|   host_heap_size = heap_size; |  | ||||||
|   HostCommBuf= GlobalSharedMemory::HostCommBuf; |  | ||||||
|   HostBufferFreeAll(); |  | ||||||
| #endif   |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////// | ||||||
|   // find comm ranks in our SHM group (i.e. which ranks are on our node) |   // find comm ranks in our SHM group (i.e. which ranks are on our node) | ||||||
|   ///////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -29,27 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| extern std::vector<std::pair<int,int> > Cshift_table;  | extern Vector<std::pair<int,int> > Cshift_table;  | ||||||
| extern commVector<std::pair<int,int> > Cshift_table_device;  |  | ||||||
|  |  | ||||||
| inline std::pair<int,int> *MapCshiftTable(void) |  | ||||||
| { |  | ||||||
|   // GPU version |  | ||||||
| #ifdef ACCELERATOR_CSHIFT     |  | ||||||
|   uint64_t sz=Cshift_table.size(); |  | ||||||
|   if (Cshift_table_device.size()!=sz )    { |  | ||||||
|     Cshift_table_device.resize(sz); |  | ||||||
|   } |  | ||||||
|   acceleratorCopyToDevice((void *)&Cshift_table[0], |  | ||||||
| 			  (void *)&Cshift_table_device[0], |  | ||||||
| 			  sizeof(Cshift_table[0])*sz); |  | ||||||
|  |  | ||||||
|   return &Cshift_table_device[0]; |  | ||||||
| #else  |  | ||||||
|   return &Cshift_table[0]; |  | ||||||
| #endif |  | ||||||
|   // CPU version use identify map |  | ||||||
| } |  | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| // Gather for when there is no need to SIMD split  | // Gather for when there is no need to SIMD split  | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| @@ -93,8 +74,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim | |||||||
|   } |   } | ||||||
|   { |   { | ||||||
|     auto buffer_p = & buffer[0]; |     auto buffer_p = & buffer[0]; | ||||||
|     auto table = MapCshiftTable(); |     auto table = &Cshift_table[0]; | ||||||
| #ifdef ACCELERATOR_CSHIFT | #ifdef ACCELERATOR_CSHIFT     | ||||||
|     autoView(rhs_v , rhs, AcceleratorRead); |     autoView(rhs_v , rhs, AcceleratorRead); | ||||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ |     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||||
| 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); | 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); | ||||||
| @@ -244,7 +225,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector< | |||||||
|    |    | ||||||
|   { |   { | ||||||
|     auto buffer_p = & buffer[0]; |     auto buffer_p = & buffer[0]; | ||||||
|     auto table = MapCshiftTable(); |     auto table = &Cshift_table[0]; | ||||||
| #ifdef ACCELERATOR_CSHIFT     | #ifdef ACCELERATOR_CSHIFT     | ||||||
|     autoView( rhs_v, rhs, AcceleratorWrite); |     autoView( rhs_v, rhs, AcceleratorWrite); | ||||||
|     accelerator_for(i,ent,vobj::Nsimd(),{ |     accelerator_for(i,ent,vobj::Nsimd(),{ | ||||||
| @@ -316,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) | ||||||
|  |  | ||||||
|  | template <typename T> | ||||||
|  | T iDivUp(T a, T b) // Round a / b to nearest higher integer value | ||||||
|  | { return (a % b != 0) ? (a / b + 1) : (a / b); } | ||||||
|  |  | ||||||
|  | template <typename T> | ||||||
|  | __global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride) | ||||||
|  | { | ||||||
|  |     int idx = blockIdx.x*blockDim.x + threadIdx.x; | ||||||
|  |     if (idx >= e1*e2) return; | ||||||
|  |  | ||||||
|  |     int n, b, o; | ||||||
|  |  | ||||||
|  |     n = idx / e2; | ||||||
|  |     b = idx % e2; | ||||||
|  |     o = n*stride + b; | ||||||
|  |  | ||||||
|  |     vector[2*idx + 0] = lo + o; | ||||||
|  |     vector[2*idx + 1] = ro + o; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #endif | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| // local to node block strided copies | // local to node block strided copies | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| @@ -340,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | |||||||
|   int ent=0; |   int ent=0; | ||||||
|  |  | ||||||
|   if(cbmask == 0x3 ){ |   if(cbmask == 0x3 ){ | ||||||
|  | #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) | ||||||
|  |     ent = e1*e2; | ||||||
|  |     dim3 blockSize(acceleratorThreads()); | ||||||
|  |     dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); | ||||||
|  |     populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); | ||||||
|  |     accelerator_barrier(); | ||||||
|  | #else | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
|         int o =n*stride+b; |         int o =n*stride+b; | ||||||
| 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o); | 	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  | #endif | ||||||
|   } else {  |   } else {  | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| @@ -359,7 +372,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   { |   { | ||||||
|     auto table = MapCshiftTable(); |     auto table = &Cshift_table[0]; | ||||||
| #ifdef ACCELERATOR_CSHIFT     | #ifdef ACCELERATOR_CSHIFT     | ||||||
|     autoView(rhs_v , rhs, AcceleratorRead); |     autoView(rhs_v , rhs, AcceleratorRead); | ||||||
|     autoView(lhs_v , lhs, AcceleratorWrite); |     autoView(lhs_v , lhs, AcceleratorWrite); | ||||||
| @@ -396,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | |||||||
|   int ent=0; |   int ent=0; | ||||||
|  |  | ||||||
|   if ( cbmask == 0x3 ) { |   if ( cbmask == 0x3 ) { | ||||||
|  | #if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT) | ||||||
|  |     ent = e1*e2; | ||||||
|  |     dim3 blockSize(acceleratorThreads()); | ||||||
|  |     dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x)); | ||||||
|  |     populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride); | ||||||
|  |     accelerator_barrier(); | ||||||
|  | #else | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|     for(int b=0;b<e2;b++){ |     for(int b=0;b<e2;b++){ | ||||||
|       int o  =n*stride; |       int o  =n*stride; | ||||||
|       Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); |       Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b); | ||||||
|     }} |     }} | ||||||
|  | #endif | ||||||
|   } else { |   } else { | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|     for(int b=0;b<e2;b++){ |     for(int b=0;b<e2;b++){ | ||||||
| @@ -411,7 +432,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   { |   { | ||||||
|     auto table = MapCshiftTable(); |     auto table = &Cshift_table[0]; | ||||||
| #ifdef ACCELERATOR_CSHIFT     | #ifdef ACCELERATOR_CSHIFT     | ||||||
|     autoView( rhs_v, rhs, AcceleratorRead); |     autoView( rhs_v, rhs, AcceleratorRead); | ||||||
|     autoView( lhs_v, lhs, AcceleratorWrite); |     autoView( lhs_v, lhs, AcceleratorWrite); | ||||||
|   | |||||||
| @@ -52,8 +52,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension | |||||||
|   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ; |   int comm_dim        = rhs.Grid()->_processors[dimension] >1 ; | ||||||
|   int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); |   int splice_dim      = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim); | ||||||
|  |  | ||||||
|   RealD t1,t0; |  | ||||||
|   t0=usecond(); |  | ||||||
|   if ( !comm_dim ) { |   if ( !comm_dim ) { | ||||||
|     //std::cout << "CSHIFT: Cshift_local" <<std::endl; |     //std::cout << "CSHIFT: Cshift_local" <<std::endl; | ||||||
|     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding |     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding | ||||||
| @@ -64,8 +63,6 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension | |||||||
|     //std::cout << "CSHIFT: Cshift_comms" <<std::endl; |     //std::cout << "CSHIFT: Cshift_comms" <<std::endl; | ||||||
|     Cshift_comms(ret,rhs,dimension,shift); |     Cshift_comms(ret,rhs,dimension,shift); | ||||||
|   } |   } | ||||||
|   t1=usecond(); |  | ||||||
|   //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl; |  | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -130,20 +127,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|      |      | ||||||
|   int cb= (cbmask==0x2)? Odd : Even; |   int cb= (cbmask==0x2)? Odd : Even; | ||||||
|   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); |   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); | ||||||
|   RealD tcopy=0.0; |  | ||||||
|   RealD tgather=0.0; |  | ||||||
|   RealD tscatter=0.0; |  | ||||||
|   RealD tcomms=0.0; |  | ||||||
|   uint64_t xbytes=0; |  | ||||||
|   for(int x=0;x<rd;x++){        |   for(int x=0;x<rd;x++){        | ||||||
|  |  | ||||||
|     int sx        =  (x+sshift)%rd; |     int sx        =  (x+sshift)%rd; | ||||||
|     int comm_proc = ((x+sshift)/rd)%pd; |     int comm_proc = ((x+sshift)/rd)%pd; | ||||||
|      |      | ||||||
|     if (comm_proc==0) { |     if (comm_proc==0) { | ||||||
|       tcopy-=usecond(); |  | ||||||
|       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  |       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  | ||||||
|       tcopy+=usecond(); |  | ||||||
|     } else { |     } else { | ||||||
|  |  | ||||||
|       int words = buffer_size; |       int words = buffer_size; | ||||||
| @@ -151,39 +144,26 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|  |  | ||||||
|       int bytes = words * sizeof(vobj); |       int bytes = words * sizeof(vobj); | ||||||
|  |  | ||||||
|       tgather-=usecond(); |  | ||||||
|       Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); |       Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask); | ||||||
|       tgather+=usecond(); |  | ||||||
|  |  | ||||||
|       //      int rank           = grid->_processor; |       //      int rank           = grid->_processor; | ||||||
|       int recv_from_rank; |       int recv_from_rank; | ||||||
|       int xmit_to_rank; |       int xmit_to_rank; | ||||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); |       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|        |  | ||||||
|       tcomms-=usecond(); |       grid->Barrier(); | ||||||
|       //      grid->Barrier(); |  | ||||||
|  |  | ||||||
|       grid->SendToRecvFrom((void *)&send_buf[0], |       grid->SendToRecvFrom((void *)&send_buf[0], | ||||||
| 			   xmit_to_rank, | 			   xmit_to_rank, | ||||||
| 			   (void *)&recv_buf[0], | 			   (void *)&recv_buf[0], | ||||||
| 			   recv_from_rank, | 			   recv_from_rank, | ||||||
| 			   bytes); | 			   bytes); | ||||||
|       xbytes+=bytes; |  | ||||||
|       //      grid->Barrier(); |  | ||||||
|       tcomms+=usecond(); |  | ||||||
|  |  | ||||||
|       tscatter-=usecond(); |       grid->Barrier(); | ||||||
|  |  | ||||||
|       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); |       Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); | ||||||
|       tscatter+=usecond(); |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   /* |  | ||||||
|   std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; |  | ||||||
|   */ |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| @@ -210,12 +190,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   assert(shift>=0); |   assert(shift>=0); | ||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|  |  | ||||||
|   RealD tcopy=0.0; |  | ||||||
|   RealD tgather=0.0; |  | ||||||
|   RealD tscatter=0.0; |  | ||||||
|   RealD tcomms=0.0; |  | ||||||
|   uint64_t xbytes=0; |  | ||||||
|    |  | ||||||
|   int permute_type=grid->PermuteType(dimension); |   int permute_type=grid->PermuteType(dimension); | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////// |   /////////////////////////////////////////////// | ||||||
| @@ -253,9 +227,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|       pointers[i] = &send_buf_extract[i][0]; |       pointers[i] = &send_buf_extract[i][0]; | ||||||
|     } |     } | ||||||
|     int sx   = (x+sshift)%rd; |     int sx   = (x+sshift)%rd; | ||||||
|     tgather-=usecond(); |  | ||||||
|     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); |     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); | ||||||
|     tgather+=usecond(); |  | ||||||
|  |  | ||||||
|     for(int i=0;i<Nsimd;i++){ |     for(int i=0;i<Nsimd;i++){ | ||||||
|        |        | ||||||
| @@ -280,8 +252,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|       if(nbr_proc){ |       if(nbr_proc){ | ||||||
| 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|  |  | ||||||
| 	tcomms-=usecond(); | 	grid->Barrier(); | ||||||
| 	//	grid->Barrier(); |  | ||||||
|  |  | ||||||
| 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; | 	send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; | ||||||
| 	recv_buf_extract_mpi = &recv_buf_extract[i][0]; | 	recv_buf_extract_mpi = &recv_buf_extract[i][0]; | ||||||
| @@ -291,9 +262,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
| 			     recv_from_rank, | 			     recv_from_rank, | ||||||
| 			     bytes); | 			     bytes); | ||||||
|  |  | ||||||
| 	xbytes+=bytes; | 	grid->Barrier(); | ||||||
| 	//	grid->Barrier(); |  | ||||||
| 	tcomms+=usecond(); |  | ||||||
|  |  | ||||||
| 	rpointers[i] = &recv_buf_extract[i][0]; | 	rpointers[i] = &recv_buf_extract[i][0]; | ||||||
|       } else {  |       } else {  | ||||||
| @@ -301,17 +270,9 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|       } |       } | ||||||
|  |  | ||||||
|     } |     } | ||||||
|     tscatter-=usecond(); |  | ||||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); |     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); | ||||||
|     tscatter+=usecond(); |  | ||||||
|   } |   } | ||||||
|   /* |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; |  | ||||||
|   */ |  | ||||||
| } | } | ||||||
| #else  | #else  | ||||||
| template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| @@ -331,11 +292,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|   assert(comm_dim==1); |   assert(comm_dim==1); | ||||||
|   assert(shift>=0); |   assert(shift>=0); | ||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|   RealD tcopy=0.0; |  | ||||||
|   RealD tgather=0.0; |  | ||||||
|   RealD tscatter=0.0; |  | ||||||
|   RealD tcomms=0.0; |  | ||||||
|   uint64_t xbytes=0; |  | ||||||
|    |    | ||||||
|   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; |   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; | ||||||
|   static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size); |   static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size); | ||||||
| @@ -359,9 +315,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|      |      | ||||||
|     if (comm_proc==0) { |     if (comm_proc==0) { | ||||||
|  |  | ||||||
|       tcopy-=usecond(); |  | ||||||
|       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  |       Copy_plane(ret,rhs,dimension,x,sx,cbmask);  | ||||||
|       tcopy+=usecond(); |  | ||||||
|  |  | ||||||
|     } else { |     } else { | ||||||
|  |  | ||||||
| @@ -370,9 +324,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|  |  | ||||||
|       int bytes = words * sizeof(vobj); |       int bytes = words * sizeof(vobj); | ||||||
|  |  | ||||||
|       tgather-=usecond(); |  | ||||||
|       Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); |       Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); | ||||||
|       tgather+=usecond(); |  | ||||||
|  |  | ||||||
|       //      int rank           = grid->_processor; |       //      int rank           = grid->_processor; | ||||||
|       int recv_from_rank; |       int recv_from_rank; | ||||||
| @@ -380,8 +332,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); |       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  |  | ||||||
|  |  | ||||||
|       tcomms-=usecond(); |       grid->Barrier(); | ||||||
|       //      grid->Barrier(); |  | ||||||
|  |  | ||||||
|       acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); |       acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); | ||||||
|       grid->SendToRecvFrom((void *)&send_buf[0], |       grid->SendToRecvFrom((void *)&send_buf[0], | ||||||
| @@ -389,24 +340,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
| 			   (void *)&recv_buf[0], | 			   (void *)&recv_buf[0], | ||||||
| 			   recv_from_rank, | 			   recv_from_rank, | ||||||
| 			   bytes); | 			   bytes); | ||||||
|       xbytes+=bytes; |  | ||||||
|       acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); |       acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); | ||||||
|  |  | ||||||
|       //      grid->Barrier(); |       grid->Barrier(); | ||||||
|       tcomms+=usecond(); |  | ||||||
|  |  | ||||||
|       tscatter-=usecond(); |  | ||||||
|       Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); |       Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); | ||||||
|       tscatter+=usecond(); |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   /* |  | ||||||
|   std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl; |  | ||||||
|   */ |  | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) | ||||||
| @@ -432,11 +372,6 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   assert(simd_layout==2); |   assert(simd_layout==2); | ||||||
|   assert(shift>=0); |   assert(shift>=0); | ||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|   RealD tcopy=0.0; |  | ||||||
|   RealD tgather=0.0; |  | ||||||
|   RealD tscatter=0.0; |  | ||||||
|   RealD tcomms=0.0; |  | ||||||
|   uint64_t xbytes=0; |  | ||||||
|  |  | ||||||
|   int permute_type=grid->PermuteType(dimension); |   int permute_type=grid->PermuteType(dimension); | ||||||
|  |  | ||||||
| @@ -479,10 +414,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|     for(int i=0;i<Nsimd;i++){        |     for(int i=0;i<Nsimd;i++){        | ||||||
|       pointers[i] = &send_buf_extract[i][0]; |       pointers[i] = &send_buf_extract[i][0]; | ||||||
|     } |     } | ||||||
|     tgather-=usecond(); |  | ||||||
|     int sx   = (x+sshift)%rd; |     int sx   = (x+sshift)%rd; | ||||||
|     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); |     Gather_plane_extract(rhs,pointers,dimension,sx,cbmask); | ||||||
|     tgather+=usecond(); |  | ||||||
|  |  | ||||||
|     for(int i=0;i<Nsimd;i++){ |     for(int i=0;i<Nsimd;i++){ | ||||||
|        |        | ||||||
| @@ -507,8 +440,7 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|       if(nbr_proc){ |       if(nbr_proc){ | ||||||
| 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | 	grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|  |  | ||||||
| 	tcomms-=usecond(); | 	grid->Barrier(); | ||||||
| 	//	grid->Barrier(); |  | ||||||
|  |  | ||||||
| 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); | 	acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); | ||||||
| 	grid->SendToRecvFrom((void *)send_buf_extract_mpi, | 	grid->SendToRecvFrom((void *)send_buf_extract_mpi, | ||||||
| @@ -517,28 +449,17 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
| 			     recv_from_rank, | 			     recv_from_rank, | ||||||
| 			     bytes); | 			     bytes); | ||||||
| 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); | 	acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); | ||||||
| 	xbytes+=bytes; |  | ||||||
|  |  | ||||||
| 	//	grid->Barrier(); | 	grid->Barrier(); | ||||||
| 	tcomms+=usecond(); |  | ||||||
| 	rpointers[i] = &recv_buf_extract[i][0]; | 	rpointers[i] = &recv_buf_extract[i][0]; | ||||||
|       } else {  |       } else {  | ||||||
| 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | 	rpointers[i] = &send_buf_extract[nbr_lane][0]; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|     } |     } | ||||||
|     tscatter-=usecond(); |  | ||||||
|     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); |     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); | ||||||
|     tscatter+=usecond(); |  | ||||||
|  |  | ||||||
|   } |   } | ||||||
|   /* |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl; |  | ||||||
|   std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl; |  | ||||||
|   */ |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| NAMESPACE_END(Grid);  | NAMESPACE_END(Grid);  | ||||||
|   | |||||||
| @@ -1,5 +1,4 @@ | |||||||
| #include <Grid/GridCore.h>        | #include <Grid/GridCore.h>        | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
| std::vector<std::pair<int,int> > Cshift_table;  | Vector<std::pair<int,int> > Cshift_table;  | ||||||
| commVector<std::pair<int,int> > Cshift_table_device;  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -35,7 +35,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/lattice/Lattice_transpose.h> | #include <Grid/lattice/Lattice_transpose.h> | ||||||
| #include <Grid/lattice/Lattice_local.h> | #include <Grid/lattice/Lattice_local.h> | ||||||
| #include <Grid/lattice/Lattice_reduction.h> | #include <Grid/lattice/Lattice_reduction.h> | ||||||
| #include <Grid/lattice/Lattice_crc.h> |  | ||||||
| #include <Grid/lattice/Lattice_peekpoke.h> | #include <Grid/lattice/Lattice_peekpoke.h> | ||||||
| #include <Grid/lattice/Lattice_reality.h> | #include <Grid/lattice/Lattice_reality.h> | ||||||
| #include <Grid/lattice/Lattice_real_imag.h> | #include <Grid/lattice/Lattice_real_imag.h> | ||||||
| @@ -47,4 +46,5 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/lattice/Lattice_unary.h> | #include <Grid/lattice/Lattice_unary.h> | ||||||
| #include <Grid/lattice/Lattice_transfer.h> | #include <Grid/lattice/Lattice_transfer.h> | ||||||
| #include <Grid/lattice/Lattice_basis.h> | #include <Grid/lattice/Lattice_basis.h> | ||||||
|  | #include <Grid/lattice/Lattice_crc.h> | ||||||
| #include <Grid/lattice/PaddedCell.h> | #include <Grid/lattice/PaddedCell.h> | ||||||
|   | |||||||
| @@ -270,42 +270,5 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L | |||||||
|     return axpby_norm_fast(ret,a,b,x,y); |     return axpby_norm_fast(ret,a,b,x,y); | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Trace product |  | ||||||
| template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2) |  | ||||||
|   -> Lattice<decltype(trace(obj()))> |  | ||||||
| { |  | ||||||
|   typedef decltype(trace(obj())) robj; |  | ||||||
|   Lattice<robj> ret_i(rhs_1.Grid()); |  | ||||||
|   autoView( rhs1 , rhs_1, AcceleratorRead); |  | ||||||
|   autoView( rhs2 , rhs_2, AcceleratorRead); |  | ||||||
|   autoView( ret , ret_i, AcceleratorWrite); |  | ||||||
|   ret.Checkerboard() = rhs_1.Checkerboard(); |  | ||||||
|   accelerator_for(ss,rhs1.size(),obj::Nsimd(),{ |  | ||||||
|       coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss))); |  | ||||||
|   }); |  | ||||||
|   return ret_i; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2) |  | ||||||
|   -> Lattice<decltype(trace(obj1()))> |  | ||||||
| { |  | ||||||
|   typedef decltype(trace(obj1())) robj; |  | ||||||
|   Lattice<robj> ret_i(rhs_1.Grid()); |  | ||||||
|   autoView( rhs1 , rhs_1, AcceleratorRead); |  | ||||||
|   autoView( ret , ret_i, AcceleratorWrite); |  | ||||||
|   ret.Checkerboard() = rhs_1.Checkerboard(); |  | ||||||
|   accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{ |  | ||||||
|       coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2)); |  | ||||||
|   }); |  | ||||||
|   return ret_i; |  | ||||||
| } |  | ||||||
| template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1) |  | ||||||
|   -> Lattice<decltype(trace(obj1()))> |  | ||||||
| { |  | ||||||
|   return traceProduct(rhs_1,rhs_2); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) | |||||||
|     basis_v.push_back(basis[k].View(AcceleratorWrite)); |     basis_v.push_back(basis[k].View(AcceleratorWrite)); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| #if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) ) | #if ( (!defined(GRID_CUDA)) ) | ||||||
|   int max_threads = thread_max(); |   int max_threads = thread_max(); | ||||||
|   Vector < vobj > Bt(Nm * max_threads); |   Vector < vobj > Bt(Nm * max_threads); | ||||||
|   thread_region |   thread_region | ||||||
|   | |||||||
| @@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1 | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> uint32_t crc(const Lattice<vobj> & buf) | template<class vobj> uint32_t crc(Lattice<vobj> & buf) | ||||||
| { | { | ||||||
|   autoView( buf_v , buf, CpuRead); |   autoView( buf_v , buf, CpuRead); | ||||||
|   return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); |   return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites()); | ||||||
| } | } | ||||||
|  |  | ||||||
| #define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl; | #define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl; | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -31,7 +31,6 @@ Author: Christoph Lehner <christoph@lhnr.de> | |||||||
| #if defined(GRID_SYCL) | #if defined(GRID_SYCL) | ||||||
| #include <Grid/lattice/Lattice_reduction_sycl.h> | #include <Grid/lattice/Lattice_reduction_sycl.h> | ||||||
| #endif | #endif | ||||||
| #include <Grid/lattice/Lattice_slicesum_core.h> |  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| @@ -281,29 +280,11 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> & | |||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) { | ||||||
|   GridBase *grid = left.Grid(); |   GridBase *grid = left.Grid(); | ||||||
|  |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|   uint64_t csum=0; |  | ||||||
|   if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone) |  | ||||||
|   { |  | ||||||
|     // Hack |  | ||||||
|     // Fast integer xor checksum. Can also be used in comms now. |  | ||||||
|     autoView(l_v,left,AcceleratorRead); |  | ||||||
|     Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t); |  | ||||||
|     uint64_t *base= (uint64_t *)&l_v[0]; |  | ||||||
|     csum=svm_xor(base,words); |  | ||||||
|   } |  | ||||||
|   FlightRecorder::CsumLog(csum); |  | ||||||
| #endif |  | ||||||
|   ComplexD nrm = rankInnerProduct(left,right); |   ComplexD nrm = rankInnerProduct(left,right); | ||||||
|   RealD local = real(nrm); |  | ||||||
|   FlightRecorder::NormLog(real(nrm));  |  | ||||||
|   grid->GlobalSum(nrm); |   grid->GlobalSum(nrm); | ||||||
|   FlightRecorder::ReductionLog(local,real(nrm));  |  | ||||||
|   return nrm; |   return nrm; | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -467,10 +448,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector< | |||||||
|   int e1=    grid->_slice_nblock[orthogdim]; |   int e1=    grid->_slice_nblock[orthogdim]; | ||||||
|   int e2=    grid->_slice_block [orthogdim]; |   int e2=    grid->_slice_block [orthogdim]; | ||||||
|   int stride=grid->_slice_stride[orthogdim]; |   int stride=grid->_slice_stride[orthogdim]; | ||||||
|   int ostride=grid->_ostride[orthogdim]; |  | ||||||
|    |   // sum over reduced dimension planes, breaking out orthog dir | ||||||
|   //Reduce Data down to lvSum |   // Parallel over orthog direction | ||||||
|   sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd); |   autoView( Data_v, Data, CpuRead); | ||||||
|  |   thread_for( r,rd, { | ||||||
|  |     int so=r*grid->_ostride[orthogdim]; // base offset for start of plane  | ||||||
|  |     for(int n=0;n<e1;n++){ | ||||||
|  |       for(int b=0;b<e2;b++){ | ||||||
|  | 	int ss= so+n*stride+b; | ||||||
|  | 	lvSum[r]=lvSum[r]+Data_v[ss]; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   }); | ||||||
|  |  | ||||||
|   // Sum across simd lanes in the plane, breaking out orthog dir. |   // Sum across simd lanes in the plane, breaking out orthog dir. | ||||||
|   Coordinate icoor(Nd); |   Coordinate icoor(Nd); | ||||||
| @@ -514,7 +504,6 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim) | |||||||
|   return result; |   return result; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)  | static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)  | ||||||
| { | { | ||||||
|   | |||||||
| @@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator & | |||||||
|   cudaGetDevice(&device); |   cudaGetDevice(&device); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_HIP | #ifdef GRID_HIP | ||||||
|   auto r=hipGetDevice(&device); |   hipGetDevice(&device); | ||||||
| #endif | #endif | ||||||
|    |    | ||||||
|   Iterator warpSize            = gpu_props[device].warpSize; |   Iterator warpSize            = gpu_props[device].warpSize; | ||||||
|   | |||||||
| @@ -69,29 +69,28 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite | |||||||
|   return result; |   return result; | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Word> Word svm_xor(Word *vec,uint64_t L) |  | ||||||
| { |  | ||||||
|   Word xorResult; xorResult = 0; |  | ||||||
|   Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator); |  | ||||||
|   Word identity;  identity=0; |  | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |  | ||||||
|      auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>()); |  | ||||||
|      cgh.parallel_for(cl::sycl::range<1>{L}, |  | ||||||
| 		      Reduction, |  | ||||||
| 		      [=] (cl::sycl::id<1> index, auto &sum) { |  | ||||||
| 	 sum ^=vec[index]; |  | ||||||
|      }); |  | ||||||
|    }); |  | ||||||
|   theGridAccelerator->wait(); |  | ||||||
|   Word ret = d_sum[0]; |  | ||||||
|   free(d_sum,*theGridAccelerator); |  | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| /* | /* | ||||||
|  | template<class Double> Double svm_reduce(Double *vec,uint64_t L) | ||||||
|  | { | ||||||
|  |   Double sumResult; zeroit(sumResult); | ||||||
|  |   Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator); | ||||||
|  |   Double identity;  zeroit(identity); | ||||||
|  |   theGridAccelerator->submit([&](cl::sycl::handler &cgh) { | ||||||
|  |      auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>()); | ||||||
|  |      cgh.parallel_for(cl::sycl::range<1>{L}, | ||||||
|  | 		      Reduction, | ||||||
|  | 		      [=] (cl::sycl::id<1> index, auto &sum) { | ||||||
|  | 	 sum +=vec[index]; | ||||||
|  |      }); | ||||||
|  |    }); | ||||||
|  |   theGridAccelerator->wait(); | ||||||
|  |   Double ret = d_sum[0]; | ||||||
|  |   free(d_sum,*theGridAccelerator); | ||||||
|  |   std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl; | ||||||
|  |   return ret; | ||||||
|  | } | ||||||
|  |  | ||||||
| template <class vobj> | template <class vobj> | ||||||
| inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) | inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites) | ||||||
|   | |||||||
| @@ -152,7 +152,6 @@ public: | |||||||
| #ifdef RNG_FAST_DISCARD | #ifdef RNG_FAST_DISCARD | ||||||
|   static void Skip(RngEngine &eng,uint64_t site) |   static void Skip(RngEngine &eng,uint64_t site) | ||||||
|   { |   { | ||||||
| #if 0 |  | ||||||
|     ///////////////////////////////////////////////////////////////////////////////////// |     ///////////////////////////////////////////////////////////////////////////////////// | ||||||
|     // Skip by 2^40 elements between successive lattice sites |     // Skip by 2^40 elements between successive lattice sites | ||||||
|     // This goes by 10^12. |     // This goes by 10^12. | ||||||
| @@ -163,9 +162,9 @@ public: | |||||||
|     // tens of seconds per trajectory so this is clean in all reasonable cases, |     // tens of seconds per trajectory so this is clean in all reasonable cases, | ||||||
|     // and margin of safety is orders of magnitude. |     // and margin of safety is orders of magnitude. | ||||||
|     // We could hack Sitmo to skip in the higher order words of state if necessary |     // We could hack Sitmo to skip in the higher order words of state if necessary | ||||||
|     // |       // | ||||||
|     // Replace with 2^30 ; avoid problem on large volumes |       // Replace with 2^30 ; avoid problem on large volumes | ||||||
|     // |       // | ||||||
|     ///////////////////////////////////////////////////////////////////////////////////// |     ///////////////////////////////////////////////////////////////////////////////////// | ||||||
|     //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init |     //      uint64_t skip = site+1;  //   Old init Skipped then drew.  Checked compat with faster init | ||||||
|     const int shift = 30; |     const int shift = 30; | ||||||
| @@ -180,9 +179,6 @@ public: | |||||||
|     assert((skip >> shift)==site); // check for overflow |     assert((skip >> shift)==site); // check for overflow | ||||||
|  |  | ||||||
|     eng.discard(skip); |     eng.discard(skip); | ||||||
| #else |  | ||||||
|     eng.discardhi(site); |  | ||||||
| #endif |  | ||||||
|     //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl; |     //      std::cout << " Engine  " <<site << " state " <<eng<<std::endl; | ||||||
|   }  |   }  | ||||||
| #endif | #endif | ||||||
| @@ -411,7 +407,7 @@ public: | |||||||
|       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; |       std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl; | ||||||
|       SeedFixedIntegers(seeds); |       SeedFixedIntegers(seeds); | ||||||
|     } |     } | ||||||
|   void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){ |   void SeedFixedIntegers(const std::vector<int> &seeds){ | ||||||
|  |  | ||||||
|     // Everyone generates the same seed_seq based on input seeds |     // Everyone generates the same seed_seq based on input seeds | ||||||
|     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); |     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size()); | ||||||
| @@ -428,6 +424,7 @@ public: | |||||||
|     // MT implementation does not implement fast discard even though |     // MT implementation does not implement fast discard even though | ||||||
|     // in principle this is possible |     // in principle this is possible | ||||||
|     //////////////////////////////////////////////// |     //////////////////////////////////////////////// | ||||||
|  | #if 1 | ||||||
|     thread_for( lidx, _grid->lSites(), { |     thread_for( lidx, _grid->lSites(), { | ||||||
|  |  | ||||||
| 	int gidx; | 	int gidx; | ||||||
| @@ -448,12 +445,29 @@ public: | |||||||
| 	 | 	 | ||||||
| 	int l_idx=generator_idx(o_idx,i_idx); | 	int l_idx=generator_idx(o_idx,i_idx); | ||||||
| 	_generators[l_idx] = master_engine; | 	_generators[l_idx] = master_engine; | ||||||
| 	if ( britney ) {  | 	Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | ||||||
| 	  Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence |     }); | ||||||
| 	} else { 	 | #else | ||||||
|  |     // Everybody loops over global volume. | ||||||
|  |     thread_for( gidx, _grid->_gsites, { | ||||||
|  |  | ||||||
|  | 	// Where is it? | ||||||
|  | 	int rank; | ||||||
|  | 	int o_idx; | ||||||
|  | 	int i_idx; | ||||||
|  |  | ||||||
|  | 	Coordinate gcoor; | ||||||
|  | 	_grid->GlobalIndexToGlobalCoor(gidx,gcoor); | ||||||
|  | 	_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); | ||||||
|  | 	 | ||||||
|  | 	// If this is one of mine we take it | ||||||
|  | 	if( rank == _grid->ThisRank() ){ | ||||||
|  | 	  int l_idx=generator_idx(o_idx,i_idx); | ||||||
|  | 	  _generators[l_idx] = master_engine; | ||||||
| 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | 	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence | ||||||
| 	} | 	} | ||||||
|     }); |     }); | ||||||
|  | #endif | ||||||
| #else  | #else  | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|     // Machine and thread decomposition dependent seeding is efficient |     // Machine and thread decomposition dependent seeding is efficient | ||||||
|   | |||||||
| @@ -1,224 +0,0 @@ | |||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #if defined(GRID_CUDA) |  | ||||||
|  |  | ||||||
| #include <cub/cub.cuh> |  | ||||||
| #define gpucub cub |  | ||||||
| #define gpuError_t cudaError_t |  | ||||||
| #define gpuSuccess cudaSuccess |  | ||||||
|  |  | ||||||
| #elif defined(GRID_HIP) |  | ||||||
|  |  | ||||||
| #include <hipcub/hipcub.hpp> |  | ||||||
| #define gpucub hipcub |  | ||||||
| #define gpuError_t hipError_t |  | ||||||
| #define gpuSuccess hipSuccess |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #if defined(GRID_CUDA) || defined(GRID_HIP) |  | ||||||
| template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { |  | ||||||
|   size_t subvol_size = e1*e2; |  | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size); |  | ||||||
|   auto rb_p = &reduction_buffer[0]; |  | ||||||
|   vobj zero_init; |  | ||||||
|   zeroit(zero_init); |  | ||||||
|  |  | ||||||
|    |  | ||||||
|   void *temp_storage_array = NULL; |  | ||||||
|   size_t temp_storage_bytes = 0; |  | ||||||
|   vobj *d_out; |  | ||||||
|   int* d_offsets; |  | ||||||
|  |  | ||||||
|   std::vector<int> offsets(rd+1,0); |  | ||||||
|  |  | ||||||
|   for (int i = 0; i < offsets.size(); i++) { |  | ||||||
|     offsets[i] = i*subvol_size; |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   //Allocate memory for output and offset arrays on device |  | ||||||
|   d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj))); |  | ||||||
|    |  | ||||||
|   d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int))); |  | ||||||
|    |  | ||||||
|   //copy offsets to device |  | ||||||
|   acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream); |  | ||||||
|    |  | ||||||
|    |  | ||||||
|   gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream); |  | ||||||
|   if (gpuErr!=gpuSuccess) { |  | ||||||
|     std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl; |  | ||||||
|     exit(EXIT_FAILURE); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   //allocate memory for temp_storage_array   |  | ||||||
|   temp_storage_array = acceleratorAllocDevice(temp_storage_bytes); |  | ||||||
|    |  | ||||||
|   //prepare buffer for reduction |  | ||||||
|   //use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream) |  | ||||||
|   //use 2d accelerator_for to avoid launch latencies found when serially looping over rd  |  | ||||||
|   accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{  |  | ||||||
|    |  | ||||||
|     int n = s / e2; |  | ||||||
|     int b = s % e2; |  | ||||||
|     int so=r*ostride; // base offset for start of plane  |  | ||||||
|     int ss= so+n*stride+b; |  | ||||||
|  |  | ||||||
|     coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); |  | ||||||
|  |  | ||||||
|   }); |  | ||||||
|    |  | ||||||
|   //issue segmented reductions in computeStream |  | ||||||
|   gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream); |  | ||||||
|   if (gpuErr!=gpuSuccess) { |  | ||||||
|     std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl; |  | ||||||
|     exit(EXIT_FAILURE); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream); |  | ||||||
|    |  | ||||||
|   //sync after copy |  | ||||||
|   accelerator_barrier(); |  | ||||||
|   |  | ||||||
|   acceleratorFreeDevice(temp_storage_array); |  | ||||||
|   acceleratorFreeDevice(d_out); |  | ||||||
|   acceleratorFreeDevice(d_offsets); |  | ||||||
|    |  | ||||||
|  |  | ||||||
| } |  | ||||||
| #endif  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #if defined(GRID_SYCL) |  | ||||||
| template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) |  | ||||||
| { |  | ||||||
|   size_t subvol_size = e1*e2; |  | ||||||
|  |  | ||||||
|   vobj *mysum = (vobj *) malloc_shared(rd*sizeof(vobj),*theGridAccelerator); |  | ||||||
|   vobj vobj_zero; |  | ||||||
|   zeroit(vobj_zero); |  | ||||||
|   for (int r = 0; r<rd; r++) {  |  | ||||||
|     mysum[r] = vobj_zero;  |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   commVector<vobj> reduction_buffer(rd*subvol_size);     |  | ||||||
|  |  | ||||||
|   auto rb_p = &reduction_buffer[0]; |  | ||||||
|  |  | ||||||
|   // autoView(Data_v, Data, AcceleratorRead); |  | ||||||
|  |  | ||||||
|   //prepare reduction buffer  |  | ||||||
|   accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{  |  | ||||||
|    |  | ||||||
|       int n = s / e2; |  | ||||||
|       int b = s % e2; |  | ||||||
|       int so=r*ostride; // base offset for start of plane  |  | ||||||
|       int ss= so+n*stride+b; |  | ||||||
|  |  | ||||||
|       coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss])); |  | ||||||
|  |  | ||||||
|   }); |  | ||||||
|  |  | ||||||
|   for (int r = 0; r < rd; r++) { |  | ||||||
|       theGridAccelerator->submit([&](cl::sycl::handler &cgh) { |  | ||||||
|           auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>()); |  | ||||||
|           cgh.parallel_for(cl::sycl::range<1>{subvol_size}, |  | ||||||
|           Reduction, |  | ||||||
|           [=](cl::sycl::id<1> item, auto &sum) { |  | ||||||
|               auto s = item[0]; |  | ||||||
|               sum += rb_p[r*subvol_size+s]; |  | ||||||
|           }); |  | ||||||
|       }); |  | ||||||
|        |  | ||||||
|       |  | ||||||
|   } |  | ||||||
|   theGridAccelerator->wait(); |  | ||||||
|   for (int r = 0; r < rd; r++) { |  | ||||||
|     lvSum[r] = mysum[r]; |  | ||||||
|   } |  | ||||||
|   free(mysum,*theGridAccelerator); |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) { |  | ||||||
|   typedef typename vobj::vector_type vector; |  | ||||||
|   const int words = sizeof(vobj)/sizeof(vector); |  | ||||||
|   const int osites = rd*e1*e2; |  | ||||||
|   commVector<vector>buffer(osites); |  | ||||||
|   vector *dat = (vector *)Data; |  | ||||||
|   vector *buf = &buffer[0]; |  | ||||||
|   Vector<vector> lvSum_small(rd); |  | ||||||
|   vector *lvSum_ptr = (vector *)&lvSum[0]; |  | ||||||
|  |  | ||||||
|   for (int w = 0; w < words; w++) { |  | ||||||
|     accelerator_for(ss,osites,1,{ |  | ||||||
| 	    buf[ss] = dat[ss*words+w]; |  | ||||||
|     }); |  | ||||||
|  |  | ||||||
|     #if defined(GRID_CUDA) || defined(GRID_HIP) |  | ||||||
|       sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); |  | ||||||
|     #elif defined(GRID_SYCL) |  | ||||||
|       sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd); |  | ||||||
|     #endif |  | ||||||
|  |  | ||||||
|     for (int r = 0; r < rd; r++) { |  | ||||||
|       lvSum_ptr[w+words*r]=lvSum_small[r]; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|    |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) |  | ||||||
| { |  | ||||||
|   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case. |  | ||||||
|     if constexpr (sizeof(vobj) <= 256) {  |  | ||||||
|  |  | ||||||
|       #if defined(GRID_CUDA) || defined(GRID_HIP) |  | ||||||
|         sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|       #elif defined (GRID_SYCL) |  | ||||||
|         sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|       #endif |  | ||||||
|  |  | ||||||
|     } |  | ||||||
|     else { |  | ||||||
|       sliceSumReduction_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd) |  | ||||||
| { |  | ||||||
|   // sum over reduced dimension planes, breaking out orthog dir |  | ||||||
|   // Parallel over orthog direction |  | ||||||
|   autoView( Data_v, Data, CpuRead); |  | ||||||
|   thread_for( r,rd, { |  | ||||||
|     int so=r*ostride; // base offset for start of plane  |  | ||||||
|     for(int n=0;n<e1;n++){ |  | ||||||
|       for(int b=0;b<e2;b++){ |  | ||||||
|         int ss= so+n*stride+b; |  | ||||||
|         lvSum[r]=lvSum[r]+Data_v[ss]; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   }); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)  |  | ||||||
| { |  | ||||||
|   #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) |  | ||||||
|    |  | ||||||
|   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|    |  | ||||||
|   #else |  | ||||||
|   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd); |  | ||||||
|  |  | ||||||
|   #endif |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); |  | ||||||
| @@ -469,13 +469,15 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData) | |||||||
|   Coordinate fine_rdimensions = fine->_rdimensions; |   Coordinate fine_rdimensions = fine->_rdimensions; | ||||||
|   Coordinate coarse_rdimensions = coarse->_rdimensions; |   Coordinate coarse_rdimensions = coarse->_rdimensions; | ||||||
|  |  | ||||||
|  |   vobj zz = Zero(); | ||||||
|  |    | ||||||
|   accelerator_for(sc,coarse->oSites(),1,{ |   accelerator_for(sc,coarse->oSites(),1,{ | ||||||
|  |  | ||||||
|       // One thread per sub block |       // One thread per sub block | ||||||
|       Coordinate coor_c(_ndimension); |       Coordinate coor_c(_ndimension); | ||||||
|       Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate |       Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);  // Block coordinate | ||||||
|  |  | ||||||
|       vobj cd = Zero(); |       vobj cd = zz; | ||||||
|        |        | ||||||
|       for(int sb=0;sb<blockVol;sb++){ |       for(int sb=0;sb<blockVol;sb++){ | ||||||
|  |  | ||||||
|   | |||||||
| @@ -45,7 +45,6 @@ public: | |||||||
|   }; |   }; | ||||||
|   // Host only |   // Host only | ||||||
|   GridBase * getGrid(void) const { return _grid; }; |   GridBase * getGrid(void) const { return _grid; }; | ||||||
|   vobj* getHostPointer(void) const { return _odata; }; |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -179,11 +179,11 @@ extern GridLogger GridLogSolver; | |||||||
| extern GridLogger GridLogError; | extern GridLogger GridLogError; | ||||||
| extern GridLogger GridLogWarning; | extern GridLogger GridLogWarning; | ||||||
| extern GridLogger GridLogMessage; | extern GridLogger GridLogMessage; | ||||||
| extern GridLogger GridLogDebug; | extern GridLogger GridLogDebug  ; | ||||||
| extern GridLogger GridLogPerformance; | extern GridLogger GridLogPerformance; | ||||||
| extern GridLogger GridLogDslash; | extern GridLogger GridLogDslash; | ||||||
| extern GridLogger GridLogIterative; | extern GridLogger GridLogIterative  ; | ||||||
| extern GridLogger GridLogIntegrator; | extern GridLogger GridLogIntegrator  ; | ||||||
| extern GridLogger GridLogHMC; | extern GridLogger GridLogHMC; | ||||||
| extern GridLogger GridLogMemory; | extern GridLogger GridLogMemory; | ||||||
| extern GridLogger GridLogTracing; | extern GridLogger GridLogTracing; | ||||||
| @@ -191,41 +191,6 @@ extern Colours    GridLogColours; | |||||||
|  |  | ||||||
| std::string demangle(const char* name) ; | std::string demangle(const char* name) ; | ||||||
|  |  | ||||||
| template<typename... Args> |  | ||||||
| inline std::string sjoin(Args&&... args) noexcept { |  | ||||||
|     std::ostringstream msg; |  | ||||||
|     (msg << ... << args); |  | ||||||
|     return msg.str(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /*!  @brief make log messages work like python print */ |  | ||||||
| template <typename... Args> |  | ||||||
| inline void Grid_log(Args&&... args) { |  | ||||||
|     std::string msg = sjoin(std::forward<Args>(args)...); |  | ||||||
|     std::cout << GridLogMessage << msg << std::endl; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /*!  @brief make warning messages work like python print */ |  | ||||||
| template <typename... Args> |  | ||||||
| inline void Grid_warn(Args&&... args) { |  | ||||||
|     std::string msg = sjoin(std::forward<Args>(args)...); |  | ||||||
|     std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /*!  @brief make error messages work like python print */ |  | ||||||
| template <typename... Args> |  | ||||||
| inline void Grid_error(Args&&... args) { |  | ||||||
|     std::string msg = sjoin(std::forward<Args>(args)...); |  | ||||||
|     std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /*!  @brief make pass messages work like python print */ |  | ||||||
| template <typename... Args> |  | ||||||
| inline void Grid_pass(Args&&... args) { |  | ||||||
|     std::string msg = sjoin(std::forward<Args>(args)...); |  | ||||||
|     std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #define _NBACKTRACE (256) | #define _NBACKTRACE (256) | ||||||
| extern void * Grid_backtrace_buffer[_NBACKTRACE]; | extern void * Grid_backtrace_buffer[_NBACKTRACE]; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -34,7 +34,7 @@ class GridTracer { | |||||||
| }; | }; | ||||||
| inline void tracePush(const char *name) { roctxRangePushA(name); } | inline void tracePush(const char *name) { roctxRangePushA(name); } | ||||||
| inline void tracePop(const char *name) { roctxRangePop(); } | inline void tracePop(const char *name) { roctxRangePop(); } | ||||||
| inline int  traceStart(const char *name) { return roctxRangeStart(name); } | inline int  traceStart(const char *name) { roctxRangeStart(name); } | ||||||
| inline void traceStop(int ID) { roctxRangeStop(ID); } | inline void traceStop(int ID) { roctxRangeStop(ID); } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -129,22 +129,6 @@ public: | |||||||
|   virtual ~Action(){} |   virtual ~Action(){} | ||||||
| }; | }; | ||||||
|  |  | ||||||
| template <class GaugeField > |  | ||||||
| class EmptyAction : public Action <GaugeField> |  | ||||||
| { |  | ||||||
|   virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions |  | ||||||
|   virtual RealD S(const GaugeField& U) { return 0.0;};                             // evaluate the action |  | ||||||
|   virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); };        // evaluate the action derivative |  | ||||||
|  |  | ||||||
|   /////////////////////////////// |  | ||||||
|   // Logging |  | ||||||
|   /////////////////////////////// |  | ||||||
|   virtual std::string action_name()    { return std::string("Level Force Log"); }; |  | ||||||
|   virtual std::string LogParameters()  { return std::string("No parameters");}; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| #endif // ACTION_BASE_H | #endif // ACTION_BASE_H | ||||||
|   | |||||||
| @@ -67,6 +67,7 @@ NAMESPACE_CHECK(Scalar); | |||||||
| #include <Grid/qcd/utils/Metric.h> | #include <Grid/qcd/utils/Metric.h> | ||||||
| NAMESPACE_CHECK(Metric); | NAMESPACE_CHECK(Metric); | ||||||
| #include <Grid/qcd/utils/CovariantLaplacian.h> | #include <Grid/qcd/utils/CovariantLaplacian.h> | ||||||
|  | #include <Grid/qcd/utils/CovariantLaplacianRat.h> | ||||||
| NAMESPACE_CHECK(CovariantLaplacian); | NAMESPACE_CHECK(CovariantLaplacian); | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -65,6 +65,19 @@ struct WilsonImplParams { | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | struct GaugeImplParams { | ||||||
|  | //  bool overlapCommsCompute; | ||||||
|  | //  AcceleratorVector<Real,Nd> twist_n_2pi_L; | ||||||
|  |   AcceleratorVector<Complex,Nd> boundary_phases; | ||||||
|  |   GaugeImplParams()  { | ||||||
|  |     boundary_phases.resize(Nd, 1.0); | ||||||
|  | //      twist_n_2pi_L.resize(Nd, 0.0); | ||||||
|  |   }; | ||||||
|  |   GaugeImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi) { | ||||||
|  | //    twist_n_2pi_L.resize(Nd, 0.0); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
| struct StaggeredImplParams { | struct StaggeredImplParams { | ||||||
|   Coordinate dirichlet; // Blocksize of dirichlet BCs |   Coordinate dirichlet; // Blocksize of dirichlet BCs | ||||||
|   int  partialDirichlet; |   int  partialDirichlet; | ||||||
|   | |||||||
| @@ -63,9 +63,7 @@ public: | |||||||
|   virtual void MooeeDag(const FermionField &in, FermionField &out) ; |   virtual void MooeeDag(const FermionField &in, FermionField &out) ; | ||||||
|   virtual void MooeeInv(const FermionField &in, FermionField &out) ; |   virtual void MooeeInv(const FermionField &in, FermionField &out) ; | ||||||
|   virtual void MooeeInvDag(const FermionField &in, FermionField &out) ; |   virtual void MooeeInvDag(const FermionField &in, FermionField &out) ; | ||||||
|   virtual void M(const FermionField &in, FermionField &out) ; |  | ||||||
|   virtual void Mdag(const FermionField &in, FermionField &out) ; |  | ||||||
|    |  | ||||||
| private: | private: | ||||||
|   RealD mu; // TwistedMass parameter |   RealD mu; // TwistedMass parameter | ||||||
|  |  | ||||||
|   | |||||||
| @@ -280,16 +280,20 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, | |||||||
|  |  | ||||||
|   if( interior && exterior ) {  |   if( interior && exterior ) {  | ||||||
|     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;} |     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;} | ||||||
|     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;} |  | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|  |     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;} | ||||||
|     if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;} |     if (Opt == OptInlineAsm  ) {  ASM_CALL(DhopSiteAsm);     return;} | ||||||
| #endif | #endif | ||||||
|   } else if( interior ) { |   } else if( interior ) { | ||||||
|     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} |     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;} | ||||||
|  | #ifndef GRID_CUDA | ||||||
|     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;} |     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;} | ||||||
|  | #endif | ||||||
|   } else if( exterior ) {  |   } else if( exterior ) {  | ||||||
|     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} |     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;} | ||||||
|  | #ifndef GRID_CUDA | ||||||
|     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;} |     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;} | ||||||
|  | #endif | ||||||
|   } |   } | ||||||
|   assert(0 && " Kernel optimisation case not covered "); |   assert(0 && " Kernel optimisation case not covered "); | ||||||
| } | } | ||||||
| @@ -318,13 +322,19 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, | |||||||
|    |    | ||||||
|   if( interior && exterior ) {  |   if( interior && exterior ) {  | ||||||
|     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;} |     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;} | ||||||
|  | #ifndef GRID_CUDA | ||||||
|     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;} |     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;} | ||||||
|  | #endif | ||||||
|   } else if( interior ) { |   } else if( interior ) { | ||||||
|     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} |     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;} | ||||||
|  | #ifndef GRID_CUDA | ||||||
|     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;} |     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;} | ||||||
|  | #endif | ||||||
|   } else if( exterior ) {  |   } else if( exterior ) {  | ||||||
|     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} |     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;} | ||||||
|  | #ifndef GRID_CUDA | ||||||
|     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;} |     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;} | ||||||
|  | #endif | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -462,7 +462,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|     autoView(st_v , st,AcceleratorRead); |     autoView(st_v , st,AcceleratorRead); | ||||||
|  |  | ||||||
|    if( interior && exterior ) { |    if( interior && exterior ) { | ||||||
|      acceleratorFenceComputeStream(); |  | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
| @@ -496,7 +495,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField | |||||||
|     autoView(st_v ,st,AcceleratorRead); |     autoView(st_v ,st,AcceleratorRead); | ||||||
|  |  | ||||||
|    if( interior && exterior ) { |    if( interior && exterior ) { | ||||||
|      acceleratorFenceComputeStream(); |  | ||||||
|      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;} |      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;} | ||||||
|      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;} |      if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;} | ||||||
| #ifndef GRID_CUDA | #ifndef GRID_CUDA | ||||||
|   | |||||||
| @@ -93,25 +93,5 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou | |||||||
|   RealD b    = tm /sq; |   RealD b    = tm /sq; | ||||||
|   axpibg5x(out,in,a,b); |   axpibg5x(out,in,a,b); | ||||||
| } | } | ||||||
| template<class Impl> |  | ||||||
| void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) { |  | ||||||
|   out.Checkerboard() = in.Checkerboard(); |  | ||||||
|   this->Dhop(in, out, DaggerNo); |  | ||||||
|   FermionField tmp(out.Grid()); |  | ||||||
|   RealD a = 4.0+this->mass; |  | ||||||
|   RealD b = this->mu; |  | ||||||
|   axpibg5x(tmp,in,a,b); |  | ||||||
|   axpy(out, 1.0, tmp, out); |  | ||||||
| } |  | ||||||
| template<class Impl> |  | ||||||
| void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) { |  | ||||||
|   out.Checkerboard() = in.Checkerboard(); |  | ||||||
|   this->Dhop(in, out, DaggerYes); |  | ||||||
|   FermionField tmp(out.Grid()); |  | ||||||
|   RealD a = 4.0+this->mass; |  | ||||||
|   RealD b = -this->mu; |  | ||||||
|   axpibg5x(tmp,in,a,b); |  | ||||||
|   axpy(out, 1.0, tmp, out); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -32,7 +32,7 @@ directory | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| #define CPS_MD_TIME | #undef CPS_MD_TIME | ||||||
|  |  | ||||||
| #ifdef CPS_MD_TIME | #ifdef CPS_MD_TIME | ||||||
| #define HMC_MOMENTUM_DENOMINATOR (2.0) | #define HMC_MOMENTUM_DENOMINATOR (2.0) | ||||||
|   | |||||||
| @@ -42,9 +42,13 @@ template <class Gimpl> | |||||||
| class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> { | class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> { | ||||||
| public:   | public:   | ||||||
|   INHERIT_GIMPL_TYPES(Gimpl); |   INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|  |   typedef GaugeImplParams ImplParams; | ||||||
|  |   ImplParams Params; | ||||||
|  |  | ||||||
|   /////////////////////////// constructors |   /////////////////////////// constructors | ||||||
|   explicit WilsonGaugeAction(RealD beta_):beta(beta_){}; |   explicit WilsonGaugeAction(RealD beta_, | ||||||
|  | 		  const ImplParams &p = ImplParams() | ||||||
|  | 		  ):beta(beta_),Params(p){}; | ||||||
|  |  | ||||||
|   virtual std::string action_name() {return "WilsonGaugeAction";} |   virtual std::string action_name() {return "WilsonGaugeAction";} | ||||||
|  |  | ||||||
| @@ -56,14 +60,53 @@ public: | |||||||
|  |  | ||||||
|   virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){};  // noop as no pseudoferms |   virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){};  // noop as no pseudoferms | ||||||
|  |  | ||||||
|  | // Umu<->U maximally confusing | ||||||
|  |   virtual void boundary(const GaugeField &Umu, GaugeField &Ub){ | ||||||
|  |     typedef typename Simd::scalar_type scalar_type; | ||||||
|  |     assert(Params.boundary_phases.size() == Nd); | ||||||
|  |     GridBase *GaugeGrid=Umu.Grid(); | ||||||
|  |     GaugeLinkField U(GaugeGrid); | ||||||
|  |     GaugeLinkField tmp(GaugeGrid); | ||||||
|  |  | ||||||
|  |     Lattice<iScalar<vInteger> > coor(GaugeGrid); | ||||||
|  |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|  | 	////////// boundary phase ///////////// | ||||||
|  |       auto pha = Params.boundary_phases[mu]; | ||||||
|  |       scalar_type phase( real(pha),imag(pha) ); | ||||||
|  |       std::cout<< GridLogIterative << "[WilsonGaugeAction] boundary "<<mu<<" "<<phase<< std::endl;  | ||||||
|  |  | ||||||
|  | 	int L   = GaugeGrid->GlobalDimensions()[mu]; | ||||||
|  |         int Lmu = L - 1; | ||||||
|  |  | ||||||
|  |       LatticeCoordinate(coor, mu); | ||||||
|  |  | ||||||
|  |       U = PeekIndex<LorentzIndex>(Umu, mu); | ||||||
|  |       tmp = where(coor == Lmu, phase * U, U); | ||||||
|  |       PokeIndex<LorentzIndex>(Ub, tmp, mu); | ||||||
|  | //      PokeIndex<LorentzIndex>(Ub, U, mu); | ||||||
|  | //      PokeIndex<LorentzIndex>(Umu, tmp, mu); | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   virtual RealD S(const GaugeField &U) { |   virtual RealD S(const GaugeField &U) { | ||||||
|     RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U); |     GaugeField Ub(U.Grid()); | ||||||
|     RealD vol = U.Grid()->gSites(); |     this->boundary(U,Ub); | ||||||
|  |     static RealD lastG=0.; | ||||||
|  |     RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(Ub); | ||||||
|  |     RealD vol = Ub.Grid()->gSites(); | ||||||
|     RealD action = beta * (1.0 - plaq) * (Nd * (Nd - 1.0)) * vol * 0.5; |     RealD action = beta * (1.0 - plaq) * (Nd * (Nd - 1.0)) * vol * 0.5; | ||||||
|  |     std::cout << GridLogMessage << "[WilsonGaugeAction] dH: " << action-lastG << std::endl; | ||||||
|  |     RealD plaq_o = WilsonLoops<Gimpl>::avgPlaquette(U); | ||||||
|  |     RealD action_o = beta * (1.0 - plaq_o) * (Nd * (Nd - 1.0)) * vol * 0.5; | ||||||
|  |     std::cout << GridLogMessage << "[WilsonGaugeAction] U: " << action_o <<" Ub: "<< action  << std::endl; | ||||||
|  |     lastG=action; | ||||||
|     return action; |     return action; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   virtual void deriv(const GaugeField &U, GaugeField &dSdU) { |   virtual void deriv(const GaugeField &U, GaugeField &dSdU) { | ||||||
|  |     GaugeField Ub(U.Grid()); | ||||||
|  |     this->boundary(U,Ub); | ||||||
|     // not optimal implementation FIXME |     // not optimal implementation FIXME | ||||||
|     // extend Ta to include Lorentz indexes |     // extend Ta to include Lorentz indexes | ||||||
|  |  | ||||||
| @@ -73,10 +116,9 @@ public: | |||||||
|     GaugeLinkField dSdU_mu(U.Grid()); |     GaugeLinkField dSdU_mu(U.Grid()); | ||||||
|     for (int mu = 0; mu < Nd; mu++) { |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|  |  | ||||||
|       Umu = PeekIndex<LorentzIndex>(U, mu); |       Umu = PeekIndex<LorentzIndex>(Ub, mu); | ||||||
|        |  | ||||||
|       // Staple in direction mu |       // Staple in direction mu | ||||||
|       WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu); |       WilsonLoops<Gimpl>::Staple(dSdU_mu, Ub, mu); | ||||||
|       dSdU_mu = Ta(Umu * dSdU_mu) * factor; |       dSdU_mu = Ta(Umu * dSdU_mu) * factor; | ||||||
|        |        | ||||||
|       PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu); |       PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu); | ||||||
|   | |||||||
| @@ -178,7 +178,10 @@ NAMESPACE_BEGIN(Grid); | |||||||
|         // Use chronological inverter to forecast solutions across poles |         // Use chronological inverter to forecast solutions across poles | ||||||
|         std::vector<FermionField> prev_solns; |         std::vector<FermionField> prev_solns; | ||||||
|         if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); } |         if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); } | ||||||
|         ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast; | 	MdagMLinearOperator<AbstractEOFAFermion<Impl> ,FermionField> MdagML(Lop); | ||||||
|  | 	MdagMLinearOperator<AbstractEOFAFermion<Impl> ,FermionField> MdagMR(Rop); | ||||||
|  | //        ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast; | ||||||
|  | 	ChronoForecast<MdagMLinearOperator<AbstractEOFAFermion<Impl>, FermionField> , FermionField> Forecast; | ||||||
|  |  | ||||||
|         // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta |         // \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta | ||||||
|         RealD N(PowerNegHalf.norm); |         RealD N(PowerNegHalf.norm); | ||||||
| @@ -198,7 +201,7 @@ NAMESPACE_BEGIN(Grid); | |||||||
|           heatbathRefreshShiftCoefficients(0, -gamma_l); |           heatbathRefreshShiftCoefficients(0, -gamma_l); | ||||||
|           if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles |           if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles | ||||||
|             Lop.Mdag(CG_src, Forecast_src); |             Lop.Mdag(CG_src, Forecast_src); | ||||||
|             CG_soln = Forecast(Lop, Forecast_src, prev_solns); |             CG_soln = Forecast(MdagML, Forecast_src, prev_solns); | ||||||
|             SolverHBL(Lop, CG_src, CG_soln); |             SolverHBL(Lop, CG_src, CG_soln); | ||||||
|             prev_solns.push_back(CG_soln); |             prev_solns.push_back(CG_soln); | ||||||
|           } else { |           } else { | ||||||
| @@ -225,7 +228,7 @@ NAMESPACE_BEGIN(Grid); | |||||||
| 	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]); | 	  heatbathRefreshShiftCoefficients(1, -gamma_l*PowerNegHalf.poles[k]); | ||||||
|           if(use_heatbath_forecasting){ |           if(use_heatbath_forecasting){ | ||||||
|             Rop.Mdag(CG_src, Forecast_src); |             Rop.Mdag(CG_src, Forecast_src); | ||||||
|             CG_soln = Forecast(Rop, Forecast_src, prev_solns); |             CG_soln = Forecast(MdagMR, Forecast_src, prev_solns); | ||||||
|             SolverHBR(Rop, CG_src, CG_soln); |             SolverHBR(Rop, CG_src, CG_soln); | ||||||
|             prev_solns.push_back(CG_soln); |             prev_solns.push_back(CG_soln); | ||||||
|           } else { |           } else { | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| #pragma once | #pragma once | ||||||
|  |  | ||||||
| #define CPS_MD_TIME  | #undef CPS_MD_TIME  | ||||||
|  |  | ||||||
| #ifdef CPS_MD_TIME | #ifdef CPS_MD_TIME | ||||||
| #define HMC_MOMENTUM_DENOMINATOR (2.0) | #define HMC_MOMENTUM_DENOMINATOR (2.0) | ||||||
|   | |||||||
| @@ -121,12 +121,19 @@ public: | |||||||
|  |  | ||||||
|   template <class SmearingPolicy> |   template <class SmearingPolicy> | ||||||
|   void Run(SmearingPolicy &S) { |   void Run(SmearingPolicy &S) { | ||||||
|     Runner(S); |     TrivialMetric<typename Implementation::Field> Mtr; | ||||||
|  |     Runner(S,Mtr); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   template <class SmearingPolicy, class Metric> | ||||||
|  |   void Run(SmearingPolicy &S, Metric &Mtr) { | ||||||
|  |     Runner(S,Mtr); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void Run(){ |   void Run(){ | ||||||
|     NoSmearing<Implementation> S; |     NoSmearing<Implementation> S; | ||||||
|     Runner(S); |     TrivialMetric<typename Implementation::Field> Mtr; | ||||||
|  |     Runner(S,Mtr); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U. |   //Use the checkpointer to initialize the RNGs and the gauge field, writing the resulting gauge field into U. | ||||||
| @@ -176,15 +183,15 @@ public: | |||||||
|   ////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| private: | private: | ||||||
|   template <class SmearingPolicy> |   template <class SmearingPolicy, class Metric> | ||||||
|   void Runner(SmearingPolicy &Smearing) { |   void Runner(SmearingPolicy &Smearing, Metric &Mtr) { | ||||||
|     auto UGrid = Resources.GetCartesian(); |     auto UGrid = Resources.GetCartesian(); | ||||||
|     Field U(UGrid); |     Field U(UGrid); | ||||||
|  |  | ||||||
|     initializeGaugeFieldAndRNGs(U); |     initializeGaugeFieldAndRNGs(U); | ||||||
|  |  | ||||||
|     typedef IntegratorType<SmearingPolicy> TheIntegrator; |     typedef IntegratorType<SmearingPolicy> TheIntegrator; | ||||||
|     TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing); |     TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing,Mtr); | ||||||
|  |  | ||||||
|     // Sets the momentum filter |     // Sets the momentum filter | ||||||
|     MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter())); |     MDynamics.setMomentumFilter(*(Resources.GetMomentumFilter())); | ||||||
|   | |||||||
| @@ -55,6 +55,8 @@ struct HMCparameters: Serializable { | |||||||
|                                   Integer, NoMetropolisUntil, |                                   Integer, NoMetropolisUntil, | ||||||
| 				  bool, PerformRandomShift, /* @brief Randomly shift the gauge configuration at the start of a trajectory */ | 				  bool, PerformRandomShift, /* @brief Randomly shift the gauge configuration at the start of a trajectory */ | ||||||
|                                   std::string, StartingType, |                                   std::string, StartingType, | ||||||
|  | 				  Integer, SW, | ||||||
|  |                                   RealD, Kappa, | ||||||
|                                   IntegratorParameters, MD) |                                   IntegratorParameters, MD) | ||||||
|  |  | ||||||
|   HMCparameters() { |   HMCparameters() { | ||||||
| @@ -110,6 +112,8 @@ private: | |||||||
|   IntegratorType &TheIntegrator; |   IntegratorType &TheIntegrator; | ||||||
|   ObsListType Observables; |   ObsListType Observables; | ||||||
|  |  | ||||||
|  |   int traj_num; | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////// | ||||||
|   // Metropolis step |   // Metropolis step | ||||||
|   ///////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////// | ||||||
| @@ -200,14 +204,14 @@ private: | |||||||
|  |  | ||||||
|     std::cout << GridLogMessage << "--------------------------------------------------\n"; |     std::cout << GridLogMessage << "--------------------------------------------------\n"; | ||||||
|     std::cout << GridLogMessage << " Molecular Dynamics evolution "; |     std::cout << GridLogMessage << " Molecular Dynamics evolution "; | ||||||
|     TheIntegrator.integrate(U); |     TheIntegrator.integrate(U,traj_num); | ||||||
|     std::cout << GridLogMessage << "--------------------------------------------------\n"; |     std::cout << GridLogMessage << "--------------------------------------------------\n"; | ||||||
|  |  | ||||||
|     ////////////////////////////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|     // updated state action |     // updated state action | ||||||
|     ////////////////////////////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|     std::cout << GridLogMessage << "--------------------------------------------------\n"; |     std::cout << GridLogMessage << "--------------------------------------------------\n"; | ||||||
|     std::cout << GridLogMessage << "Compute final action"; |     std::cout << GridLogMessage << "Compute final action" <<std::endl; | ||||||
|     RealD H1 = TheIntegrator.S(U);   |     RealD H1 = TheIntegrator.S(U);   | ||||||
|     std::cout << GridLogMessage << "--------------------------------------------------\n"; |     std::cout << GridLogMessage << "--------------------------------------------------\n"; | ||||||
|  |  | ||||||
| @@ -242,7 +246,7 @@ public: | |||||||
|   HybridMonteCarlo(HMCparameters _Pams, IntegratorType &_Int, |   HybridMonteCarlo(HMCparameters _Pams, IntegratorType &_Int, | ||||||
|                    GridSerialRNG &_sRNG, GridParallelRNG &_pRNG,  |                    GridSerialRNG &_sRNG, GridParallelRNG &_pRNG,  | ||||||
|                    ObsListType _Obs, Field &_U) |                    ObsListType _Obs, Field &_U) | ||||||
|     : Params(_Pams), TheIntegrator(_Int), sRNG(_sRNG), pRNG(_pRNG), Observables(_Obs), Ucur(_U) {} |     : Params(_Pams), TheIntegrator(_Int), sRNG(_sRNG), pRNG(_pRNG), Observables(_Obs), Ucur(_U),traj_num(0) {} | ||||||
|   ~HybridMonteCarlo(){}; |   ~HybridMonteCarlo(){}; | ||||||
|  |  | ||||||
|   void evolve(void) { |   void evolve(void) { | ||||||
| @@ -257,9 +261,10 @@ public: | |||||||
|     unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory; |     unsigned int FinalTrajectory = Params.Trajectories + Params.NoMetropolisUntil + Params.StartTrajectory; | ||||||
|  |  | ||||||
|     for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) { |     for (int traj = Params.StartTrajectory; traj < FinalTrajectory; ++traj) { | ||||||
|  |      | ||||||
|  |  | ||||||
|       std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n"; |       std::cout << GridLogHMC << "-- # Trajectory = " << traj << "\n"; | ||||||
|  |       traj_num=traj; | ||||||
|       if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) { |       if (traj < Params.StartTrajectory + Params.NoMetropolisUntil) { | ||||||
|       	std::cout << GridLogHMC << "-- Thermalization" << std::endl; |       	std::cout << GridLogHMC << "-- Thermalization" << std::endl; | ||||||
|       } |       } | ||||||
|   | |||||||
| @@ -9,6 +9,7 @@ Copyright (C) 2015 | |||||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
| Author: Guido Cossu <cossu@post.kek.jp> | Author: Guido Cossu <cossu@post.kek.jp> | ||||||
|  | Author: Chulwoo Jung <chulwoo@bnl.gov> | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify | This program is free software; you can redistribute it and/or modify | ||||||
| it under the terms of the GNU General Public License as published by | it under the terms of the GNU General Public License as published by | ||||||
| @@ -33,6 +34,7 @@ directory | |||||||
| #define INTEGRATOR_INCLUDED | #define INTEGRATOR_INCLUDED | ||||||
|  |  | ||||||
| #include <memory> | #include <memory> | ||||||
|  | #include <Grid/parallelIO/NerscIO.h> | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| @@ -41,10 +43,19 @@ public: | |||||||
|   GRID_SERIALIZABLE_CLASS_MEMBERS(IntegratorParameters, |   GRID_SERIALIZABLE_CLASS_MEMBERS(IntegratorParameters, | ||||||
| 				  std::string, name,      // name of the integrator | 				  std::string, name,      // name of the integrator | ||||||
| 				  unsigned int, MDsteps,  // number of outer steps | 				  unsigned int, MDsteps,  // number of outer steps | ||||||
|  | 				  RealD, RMHMCTol, | ||||||
|  |                                   RealD, RMHMCCGTol, | ||||||
|  |                                   RealD, lambda0, | ||||||
|  |                                   RealD, lambda1, | ||||||
|  |                                   RealD, lambda2, | ||||||
| 				  RealD, trajL)           // trajectory length | 				  RealD, trajL)           // trajectory length | ||||||
|  |  | ||||||
|   IntegratorParameters(int MDsteps_ = 10, RealD trajL_ = 1.0) |   IntegratorParameters(int MDsteps_ = 10, RealD trajL_ = 1.0) | ||||||
|   : MDsteps(MDsteps_), |   : MDsteps(MDsteps_), | ||||||
|  |    lambda0(0.1931833275037836), | ||||||
|  |    lambda1(0.1931833275037836), | ||||||
|  |    lambda2(0.1931833275037836), | ||||||
|  |    RMHMCTol(1e-8),RMHMCCGTol(1e-8), | ||||||
|     trajL(trajL_) {}; |     trajL(trajL_) {}; | ||||||
|  |  | ||||||
|   template <class ReaderClass, typename std::enable_if<isReader<ReaderClass>::value, int >::type = 0 > |   template <class ReaderClass, typename std::enable_if<isReader<ReaderClass>::value, int >::type = 0 > | ||||||
| @@ -75,11 +86,14 @@ public: | |||||||
|   double t_U;  // Track time passing on each level and for U and for P |   double t_U;  // Track time passing on each level and for U and for P | ||||||
|   std::vector<double> t_P;   |   std::vector<double> t_P;   | ||||||
|  |  | ||||||
|   MomentaField P; | //  MomentaField P; | ||||||
|  |   GeneralisedMomenta<FieldImplementation > P; | ||||||
|   SmearingPolicy& Smearer; |   SmearingPolicy& Smearer; | ||||||
|   RepresentationPolicy Representations; |   RepresentationPolicy Representations; | ||||||
|   IntegratorParameters Params; |   IntegratorParameters Params; | ||||||
|  |  | ||||||
|  |   RealD Saux,Smom,Sg; | ||||||
|  |  | ||||||
|   //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC |   //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC | ||||||
|   //It is applied whenever the momentum is updated / refreshed |   //It is applied whenever the momentum is updated / refreshed | ||||||
|   //The default filter does nothing |   //The default filter does nothing | ||||||
| @@ -87,8 +101,6 @@ public: | |||||||
|  |  | ||||||
|   const ActionSet<Field, RepresentationPolicy> as; |   const ActionSet<Field, RepresentationPolicy> as; | ||||||
|  |  | ||||||
|   ActionSet<Field,RepresentationPolicy> LevelForces; |  | ||||||
|    |  | ||||||
|   //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default |   //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default | ||||||
|   static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){  |   static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){  | ||||||
|     static MomentumFilterNone<MomentaField> filter; |     static MomentumFilterNone<MomentaField> filter; | ||||||
| @@ -98,7 +110,16 @@ public: | |||||||
|   void update_P(Field& U, int level, double ep)  |   void update_P(Field& U, int level, double ep)  | ||||||
|   { |   { | ||||||
|     t_P[level] += ep; |     t_P[level] += ep; | ||||||
|     update_P(P, U, level, ep); |     update_P(P.Mom, U, level, ep); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void update_P2(Field& U, int level, double ep)  | ||||||
|  |   { | ||||||
|  |     t_P[level] += ep; | ||||||
|  |     update_P2(P.Mom, U, level, ep); | ||||||
|  |  | ||||||
|     std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl; |     std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -121,78 +142,174 @@ public: | |||||||
|     } |     } | ||||||
|   } update_P_hireps{}; |   } update_P_hireps{}; | ||||||
|  |  | ||||||
|   |  | ||||||
|   void update_P(MomentaField& Mom, Field& U, int level, double ep) { |   void update_P(MomentaField& Mom, Field& U, int level, double ep) { | ||||||
|     // input U actually not used in the fundamental case |     // input U actually not used in the fundamental case | ||||||
|     // Fundamental updates, include smearing |     // Fundamental updates, include smearing | ||||||
|  |  | ||||||
|     assert(as.size()==LevelForces.size()); |  | ||||||
|      |  | ||||||
|     Field level_force(U.Grid()); level_force =Zero(); |  | ||||||
|     for (int a = 0; a < as[level].actions.size(); ++a) { |     for (int a = 0; a < as[level].actions.size(); ++a) { | ||||||
|  |  | ||||||
|       double start_full = usecond(); |       double start_full = usecond(); | ||||||
|       Field force(U.Grid()); |       Field force(U.Grid()); | ||||||
|       conformable(U.Grid(), Mom.Grid()); |       conformable(U.Grid(), Mom.Grid()); | ||||||
|  |  | ||||||
|  |       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); | ||||||
|       double start_force = usecond(); |       double start_force = usecond(); | ||||||
|  |       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta | ||||||
|  |  | ||||||
|       as[level].actions.at(a)->deriv_timer_start(); |       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; | ||||||
|       as[level].actions.at(a)->deriv(Smearer, force);  // deriv should NOT include Ta |       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); | ||||||
|       as[level].actions.at(a)->deriv_timer_stop(); |  | ||||||
|  |  | ||||||
|       auto name = as[level].actions.at(a)->action_name(); |  | ||||||
|  |  | ||||||
|       force = FieldImplementation::projectForce(force); // Ta for gauge fields |       force = FieldImplementation::projectForce(force); // Ta for gauge fields | ||||||
|       double end_force = usecond(); |       double end_force = usecond(); | ||||||
|        |       Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); | ||||||
|       MomFilter->applyFilter(force); |       std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl; | ||||||
|  |  | ||||||
|       std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<<  std::endl; |  | ||||||
|  |  | ||||||
|       // track the total |  | ||||||
|       level_force = level_force+force; |  | ||||||
|  |  | ||||||
|       Real force_abs   = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x])  |  | ||||||
|       Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;     |  | ||||||
|  |  | ||||||
|       Real force_max   = std::sqrt(maxLocalNorm2(force)); |  | ||||||
|       Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;     |  | ||||||
|  |  | ||||||
|       as[level].actions.at(a)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); |  | ||||||
|        |  | ||||||
|       std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] dt           : " << ep <<" "<<name<<std::endl; |  | ||||||
|       std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force average: " << force_abs <<" "<<name<<std::endl; |  | ||||||
|       std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Force max    : " << force_max <<" "<<name<<std::endl; |  | ||||||
|       std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt average  : " << impulse_abs <<" "<<name<<std::endl; |  | ||||||
|       std::cout << GridLogIntegrator<< "["<<level<<"]["<<a<<"] Fdt max      : " << impulse_max <<" "<<name<<std::endl; |  | ||||||
|  |  | ||||||
|       Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;;  |       Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;;  | ||||||
|       double end_full = usecond(); |       double end_full = usecond(); | ||||||
|       double time_full  = (end_full - start_full) / 1e3; |       double time_full  = (end_full - start_full) / 1e3; | ||||||
|       double time_force = (end_force - start_force) / 1e3; |       double time_force = (end_force - start_force) / 1e3; | ||||||
|       std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl; |       std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl; | ||||||
|  |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     { |  | ||||||
|       // total force |  | ||||||
|       Real force_abs   = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm.  nb. norm2(latt) = \sum_x norm2(latt[x])  |  | ||||||
|       Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;     |  | ||||||
|  |  | ||||||
|       Real force_max   = std::sqrt(maxLocalNorm2(level_force)); |  | ||||||
|       Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;     |  | ||||||
|       LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // Force from the other representations |     // Force from the other representations | ||||||
|     as[level].apply(update_P_hireps, Representations, Mom, U, ep); |     as[level].apply(update_P_hireps, Representations, Mom, U, ep); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void update_P2(MomentaField& Mom, Field& U, int level, double ep) { | ||||||
|  |     // input U actually not used in the fundamental case | ||||||
|  |     // Fundamental updates, include smearing | ||||||
|  |  | ||||||
|  |     std::cout << GridLogIntegrator << "U before update_P2: " << std::sqrt(norm2(U)) << std::endl; | ||||||
|  |     // Generalised momenta   | ||||||
|  |     // Derivative of the kinetic term must be computed before | ||||||
|  |     // Mom is the momenta and gets updated by the  | ||||||
|  |     // actions derivatives | ||||||
|  |     MomentaField MomDer(P.Mom.Grid()); | ||||||
|  |     P.M.ImportGauge(U); | ||||||
|  |     P.DerivativeU(P.Mom, MomDer); | ||||||
|  |     std::cout << GridLogIntegrator << "MomDer update_P2: " << std::sqrt(norm2(MomDer)) << std::endl; | ||||||
|  | //    Mom -= MomDer * ep; | ||||||
|  |     Mom -= MomDer * ep * HMC_MOMENTUM_DENOMINATOR; | ||||||
|  |     std::cout << GridLogIntegrator << "Mom update_P2: " << std::sqrt(norm2(Mom)) << std::endl; | ||||||
|  |  | ||||||
|  |     // Auxiliary fields | ||||||
|  |     P.update_auxiliary_momenta(ep*0.5 ); | ||||||
|  |     P.AuxiliaryFieldsDerivative(MomDer); | ||||||
|  |     std::cout << GridLogIntegrator << "MomDer(Aux) update_P2: " << std::sqrt(norm2(Mom)) << std::endl; | ||||||
|  | //    Mom -= MomDer * ep; | ||||||
|  |     Mom -= MomDer * ep * HMC_MOMENTUM_DENOMINATOR; | ||||||
|  |     P.update_auxiliary_momenta(ep*0.5 ); | ||||||
|  |  | ||||||
|  |     for (int a = 0; a < as[level].actions.size(); ++a) { | ||||||
|  |       double start_full = usecond(); | ||||||
|  |       Field force(U.Grid()); | ||||||
|  |       conformable(U.Grid(), Mom.Grid()); | ||||||
|  |  | ||||||
|  |       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); | ||||||
|  |       double start_force = usecond(); | ||||||
|  |       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta | ||||||
|  |  | ||||||
|  |       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; | ||||||
|  |       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); | ||||||
|  |       force = FieldImplementation::projectForce(force); // Ta for gauge fields | ||||||
|  |       double end_force = usecond(); | ||||||
|  |       Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); | ||||||
|  |       std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl; | ||||||
|  |       Mom -= force * ep* HMC_MOMENTUM_DENOMINATOR;;  | ||||||
|  |       double end_full = usecond(); | ||||||
|  |       double time_full  = (end_full - start_full) / 1e3; | ||||||
|  |       double time_force = (end_force - start_force) / 1e3; | ||||||
|  |       std::cout << GridLogMessage << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)"  << std::endl; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Force from the other representations | ||||||
|  |     as[level].apply(update_P_hireps, Representations, Mom, U, ep); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void implicit_update_P(Field& U, int level, double ep, double ep1, bool intermediate = false) { | ||||||
|  |     t_P[level] += ep; | ||||||
|  |  | ||||||
|  |     double ep2= ep-ep1; | ||||||
|  |  | ||||||
|  |     std::cout << GridLogIntegrator << "[" << level << "] P " | ||||||
|  |               << " dt " << ep << " : t_P " << t_P[level] << std::endl; | ||||||
|  |     std::cout << GridLogIntegrator << "U before implicit_update_P: " << std::sqrt(norm2(U)) << std::endl; | ||||||
|  |     // Fundamental updates, include smearing | ||||||
|  |     MomentaField Msum(P.Mom.Grid()); | ||||||
|  |     Msum = Zero(); | ||||||
|  |     for (int a = 0; a < as[level].actions.size(); ++a) { | ||||||
|  |       // Compute the force terms for the lagrangian part | ||||||
|  |       // We need to compute the derivative of the actions | ||||||
|  |       // only once | ||||||
|  |       Field force(U.Grid()); | ||||||
|  |       conformable(U.Grid(), P.Mom.Grid()); | ||||||
|  |       Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared); | ||||||
|  |       as[level].actions.at(a)->deriv(Us, force);  // deriv should NOT include Ta | ||||||
|  |  | ||||||
|  |       std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl; | ||||||
|  |       if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force); | ||||||
|  |       force = FieldImplementation::projectForce(force);  // Ta for gauge fields | ||||||
|  |       Real force_abs = std::sqrt(norm2(force) / U.Grid()->gSites()); | ||||||
|  |       std::cout << GridLogIntegrator << "|Force| site average: " << force_abs | ||||||
|  |                 << std::endl; | ||||||
|  |       Msum += force; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     MomentaField NewMom = P.Mom; | ||||||
|  |     MomentaField OldMom = P.Mom; | ||||||
|  |     double threshold = Params.RMHMCTol; | ||||||
|  |     P.M.ImportGauge(U); | ||||||
|  |     MomentaField MomDer(P.Mom.Grid()); | ||||||
|  |     MomentaField MomDer1(P.Mom.Grid()); | ||||||
|  |     MomentaField AuxDer(P.Mom.Grid()); | ||||||
|  |     MomDer1 = Zero(); | ||||||
|  |     MomentaField diff(P.Mom.Grid()); | ||||||
|  |     double factor = 2.0; | ||||||
|  |     if (intermediate){ | ||||||
|  |       P.DerivativeU(P.Mom, MomDer1); | ||||||
|  |       factor = 1.0; | ||||||
|  |     } | ||||||
|  | //    std::cout << GridLogIntegrator << "MomDer1 implicit_update_P: " << std::sqrt(norm2(MomDer1)) << std::endl; | ||||||
|  |  | ||||||
|  |     // Auxiliary fields | ||||||
|  |     P.update_auxiliary_momenta(ep1); | ||||||
|  |     P.AuxiliaryFieldsDerivative(AuxDer); | ||||||
|  |     Msum += AuxDer; | ||||||
|  |      | ||||||
|  |  | ||||||
|  |     // Here run recursively | ||||||
|  |     int counter = 1; | ||||||
|  |     RealD RelativeError; | ||||||
|  |     do { | ||||||
|  |       std::cout << GridLogIntegrator << "UpdateP implicit step "<< counter << std::endl; | ||||||
|  |  | ||||||
|  |       // Compute the derivative of the kinetic term | ||||||
|  |       // with respect to the gauge field | ||||||
|  |       P.DerivativeU(NewMom, MomDer); | ||||||
|  |       Real force_abs = std::sqrt(norm2(MomDer) / U.Grid()->gSites()); | ||||||
|  |       std::cout << GridLogIntegrator << "|Force| laplacian site average: " << force_abs | ||||||
|  |                 << std::endl; | ||||||
|  |  | ||||||
|  | //      NewMom = P.Mom - ep* 0.5 * HMC_MOMENTUM_DENOMINATOR * (2.0*Msum + factor*MomDer + MomDer1);// simplify | ||||||
|  |       NewMom = P.Mom -  HMC_MOMENTUM_DENOMINATOR * (ep*Msum + ep1* factor*MomDer + ep2* MomDer1);// simplify | ||||||
|  |       diff = NewMom - OldMom; | ||||||
|  |       counter++; | ||||||
|  |       RelativeError = std::sqrt(norm2(diff))/std::sqrt(norm2(NewMom)); | ||||||
|  |       std::cout << GridLogIntegrator << "UpdateP RelativeError: " << RelativeError << std::endl; | ||||||
|  |       OldMom = NewMom; | ||||||
|  |     } while (RelativeError > threshold); | ||||||
|  |  | ||||||
|  |     P.Mom = NewMom; | ||||||
|  |     std::cout << GridLogIntegrator << "NewMom implicit_update_P: " << std::sqrt(norm2(NewMom)) << std::endl; | ||||||
|  |  | ||||||
|  |     // update the auxiliary fields momenta     | ||||||
|  |     P.update_auxiliary_momenta(ep2); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void implicit_update_P(Field& U, int level, double ep, bool intermediate = false) { | ||||||
|  |       implicit_update_P( U, level, ep, ep*0.5, intermediate );  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void update_U(Field& U, double ep)  |   void update_U(Field& U, double ep)  | ||||||
|   { |   { | ||||||
|     update_U(P, U, ep); |     update_U(P.Mom, U, ep); | ||||||
|  |  | ||||||
|     t_U += ep; |     t_U += ep; | ||||||
|     int fl = levels - 1; |     int fl = levels - 1; | ||||||
| @@ -201,12 +318,8 @@ public: | |||||||
|    |    | ||||||
|   void update_U(MomentaField& Mom, Field& U, double ep)  |   void update_U(MomentaField& Mom, Field& U, double ep)  | ||||||
|   { |   { | ||||||
|     MomentaField MomFiltered(Mom.Grid()); |  | ||||||
|     MomFiltered = Mom; |  | ||||||
|     MomFilter->applyFilter(MomFiltered); |  | ||||||
|  |  | ||||||
|     // exponential of Mom*U in the gauge fields case |     // exponential of Mom*U in the gauge fields case | ||||||
|     FieldImplementation::update_field(MomFiltered, U, ep); |     FieldImplementation::update_field(Mom, U, ep); | ||||||
|  |  | ||||||
|     // Update the smeared fields, can be implemented as observer |     // Update the smeared fields, can be implemented as observer | ||||||
|     Smearer.set_Field(U); |     Smearer.set_Field(U); | ||||||
| @@ -215,18 +328,74 @@ public: | |||||||
|     Representations.update(U);  // void functions if fundamental representation |     Representations.update(U);  // void functions if fundamental representation | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   void implicit_update_U(Field&U, double ep, double ep1 ){ | ||||||
|  |     double ep2=ep-ep1; | ||||||
|  |     t_U += ep; | ||||||
|  |     int fl = levels - 1; | ||||||
|  |     std::cout << GridLogIntegrator << "   " << "[" << fl << "] U " << " dt " << ep << " : t_U " << t_U << std::endl; | ||||||
|  |     std::cout << GridLogIntegrator << "U before implicit_update_U: " << std::sqrt(norm2(U)) << std::endl; | ||||||
|  |  | ||||||
|  |     MomentaField Mom1(P.Mom.Grid()); | ||||||
|  |     MomentaField Mom2(P.Mom.Grid()); | ||||||
|  |     RealD RelativeError; | ||||||
|  |     Field diff(U.Grid()); | ||||||
|  |     Real threshold =  Params.RMHMCTol; | ||||||
|  |     int counter = 1; | ||||||
|  |     int MaxCounter = 100; | ||||||
|  |  | ||||||
|  |     Field OldU = U; | ||||||
|  |     Field NewU = U; | ||||||
|  |  | ||||||
|  |     P.M.ImportGauge(U); | ||||||
|  |     P.DerivativeP(Mom1); // first term in the derivative  | ||||||
|  |     std::cout << GridLogIntegrator << "implicit_update_U: Mom1: " << std::sqrt(norm2(Mom1)) << std::endl; | ||||||
|  |  | ||||||
|  |     P.update_auxiliary_fields(ep1); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     MomentaField sum=Mom1; | ||||||
|  |     do { | ||||||
|  |       std::cout << GridLogIntegrator << "UpdateU implicit step "<< counter << std::endl; | ||||||
|  |        | ||||||
|  |       P.DerivativeP(Mom2); // second term in the derivative, on the updated U | ||||||
|  |       std::cout << GridLogIntegrator << "implicit_update_U: Mom1: " << std::sqrt(norm2(Mom1)) << std::endl; | ||||||
|  |       sum = (Mom1*ep1 + Mom2*ep2); | ||||||
|  |  | ||||||
|  |       for (int mu = 0; mu < Nd; mu++) { | ||||||
|  |         auto Umu = PeekIndex<LorentzIndex>(U, mu); | ||||||
|  |         auto Pmu = PeekIndex<LorentzIndex>(sum, mu); | ||||||
|  |         Umu = expMat(Pmu, 1, 12) * Umu; | ||||||
|  |         PokeIndex<LorentzIndex>(NewU, ProjectOnGroup(Umu), mu); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       diff = NewU - OldU; | ||||||
|  |       RelativeError = std::sqrt(norm2(diff))/std::sqrt(norm2(NewU)); | ||||||
|  |       std::cout << GridLogIntegrator << "UpdateU RelativeError: " << RelativeError << std::endl; | ||||||
|  |        | ||||||
|  |       P.M.ImportGauge(NewU); | ||||||
|  |       OldU = NewU; // some redundancy to be eliminated | ||||||
|  |       counter++; | ||||||
|  |     } while (RelativeError > threshold && counter < MaxCounter); | ||||||
|  |  | ||||||
|  |     U = NewU; | ||||||
|  |     std::cout << GridLogIntegrator << "NewU implicit_update_U: " << std::sqrt(norm2(U)) << std::endl; | ||||||
|  |     P.update_auxiliary_fields(ep2); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   virtual void step(Field& U, int level, int first, int last) = 0; |   virtual void step(Field& U, int level, int first, int last) = 0; | ||||||
|  |  | ||||||
| public: | public: | ||||||
|   Integrator(GridBase* grid, IntegratorParameters Par, |   Integrator(GridBase* grid, IntegratorParameters Par, | ||||||
|              ActionSet<Field, RepresentationPolicy>& Aset, |              ActionSet<Field, RepresentationPolicy>& Aset, | ||||||
|              SmearingPolicy& Sm) |              SmearingPolicy& Sm, Metric<MomentaField>& M) | ||||||
|     : Params(Par), |     : Params(Par), | ||||||
|       as(Aset), |       as(Aset), | ||||||
|       P(grid), |       P(grid, M), | ||||||
|       levels(Aset.size()), |       levels(Aset.size()), | ||||||
|       Smearer(Sm), |       Smearer(Sm), | ||||||
|       Representations(grid)  |       Representations(grid), | ||||||
|  |       Saux(0.),Smom(0.),Sg(0.) | ||||||
|   { |   { | ||||||
|     t_P.resize(levels, 0.0); |     t_P.resize(levels, 0.0); | ||||||
|     t_U = 0.0; |     t_U = 0.0; | ||||||
| @@ -234,16 +403,6 @@ public: | |||||||
|  |  | ||||||
|     //Default the momentum filter to "do-nothing" |     //Default the momentum filter to "do-nothing" | ||||||
|     MomFilter = getDefaultMomFilter(); |     MomFilter = getDefaultMomFilter(); | ||||||
|  |  | ||||||
|     for (int level = 0; level < as.size(); ++level) { |  | ||||||
|       int multiplier = as.at(level).multiplier; |  | ||||||
|       ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier); |  | ||||||
|       Level->push_back(new EmptyAction<Field>);  |  | ||||||
|       LevelForces.push_back(*Level); |  | ||||||
|       // does it copy by value or reference?? |  | ||||||
|       // - answer it copies by value, BUT the action level contains a reference that is NOT updated. |  | ||||||
|       // Unsafe code in Guido's area |  | ||||||
|     } |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   virtual ~Integrator() {} |   virtual ~Integrator() {} | ||||||
| @@ -261,14 +420,10 @@ public: | |||||||
|  |  | ||||||
|   void reset_timer(void) |   void reset_timer(void) | ||||||
|   { |   { | ||||||
|     assert(as.size()==LevelForces.size()); |  | ||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
|       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { |       for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) { | ||||||
|         as[level].actions.at(actionID)->reset_timer(); |         as[level].actions.at(actionID)->reset_timer(); | ||||||
|       } |       } | ||||||
|       int actionID=0; |  | ||||||
|       assert(LevelForces.at(level).actions.size()==1); |  | ||||||
|       LevelForces.at(level).actions.at(actionID)->reset_timer(); |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   void print_timer(void) |   void print_timer(void) | ||||||
| @@ -330,16 +485,6 @@ public: | |||||||
| 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num | 		  <<" calls "     << as[level].actions.at(actionID)->deriv_num | ||||||
| 		  << std::endl; | 		  << std::endl; | ||||||
|       } |       } | ||||||
|       int actionID=0; |  | ||||||
|       std::cout << GridLogMessage  |  | ||||||
| 		  << LevelForces[level].actions.at(actionID)->action_name() |  | ||||||
| 		  <<"["<<level<<"]["<< actionID<<"] :\n\t\t " |  | ||||||
| 		  <<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average() |  | ||||||
| 		  <<" norm "      << LevelForces[level].actions.at(actionID)->deriv_norm_average() |  | ||||||
| 		  <<" Fdt max  "  << LevelForces[level].actions.at(actionID)->Fdt_max_average() |  | ||||||
| 		  <<" Fdt norm "  << LevelForces[level].actions.at(actionID)->Fdt_norm_average() |  | ||||||
| 		  <<" calls "     << LevelForces[level].actions.at(actionID)->deriv_num |  | ||||||
| 		  << std::endl; |  | ||||||
|     } |     } | ||||||
|     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; |     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; | ||||||
|   } |   } | ||||||
| @@ -361,19 +506,13 @@ public: | |||||||
| 	std::cout << as[level].actions.at(actionID)->LogParameters(); | 	std::cout << as[level].actions.at(actionID)->LogParameters(); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl; |  | ||||||
|     for (int level = 0; level < LevelForces.size(); ++level) { |  | ||||||
|       std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl; |  | ||||||
|       for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) { |  | ||||||
| 	std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; |     std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void reverse_momenta() |   void reverse_momenta() | ||||||
|   { |   { | ||||||
|     P *= -1.0; |     P.Mom *= -1.0; | ||||||
|  |     P.AuxMom *= -1.0; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // to be used by the actionlevel class to iterate |   // to be used by the actionlevel class to iterate | ||||||
| @@ -392,11 +531,14 @@ public: | |||||||
|   // Initialization of momenta and actions |   // Initialization of momenta and actions | ||||||
|   void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG)  |   void refresh(Field& U,  GridSerialRNG & sRNG, GridParallelRNG& pRNG)  | ||||||
|   { |   { | ||||||
|     assert(P.Grid() == U.Grid()); |     assert(P.Mom.Grid() == U.Grid()); | ||||||
|     std::cout << GridLogIntegrator << "Integrator refresh" << std::endl; |     std::cout << GridLogIntegrator << "Integrator refresh" << std::endl; | ||||||
|  |  | ||||||
|     std::cout << GridLogIntegrator << "Generating momentum" << std::endl; |     std::cout << GridLogIntegrator << "Generating momentum" << std::endl; | ||||||
|     FieldImplementation::generate_momenta(P, sRNG, pRNG); | //    FieldImplementation::generate_momenta(P.Mom, sRNG, pRNG); | ||||||
|  |     P.M.ImportGauge(U); | ||||||
|  |     P.MomentaDistribution(sRNG,pRNG); | ||||||
|  |  | ||||||
|  |  | ||||||
|     // Update the smeared fields, can be implemented as observer |     // Update the smeared fields, can be implemented as observer | ||||||
|     // necessary to keep the fields updated even after a reject |     // necessary to keep the fields updated even after a reject | ||||||
| @@ -449,12 +591,24 @@ public: | |||||||
|   RealD S(Field& U)  |   RealD S(Field& U)  | ||||||
|   {  // here also U not used |   {  // here also U not used | ||||||
|  |  | ||||||
|     assert(as.size()==LevelForces.size()); |  | ||||||
|     std::cout << GridLogIntegrator << "Integrator action\n"; |     std::cout << GridLogIntegrator << "Integrator action\n"; | ||||||
|  |  | ||||||
|     RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom | //    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom | ||||||
|  | //    RealD Hterm; | ||||||
|  |  | ||||||
|  | //    static RealD Saux=0.,Smom=0.,Sg=0.; | ||||||
|  |  | ||||||
|  |     RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom | ||||||
|  |     std::cout << GridLogMessage << "S:FieldSquareNorm H_p = " << H << "\n"; | ||||||
|  |     std::cout << GridLogMessage << "S:dSField = " << H-Smom << "\n"; | ||||||
|  |     Smom=H; | ||||||
|  |     P.M.ImportGauge(U); | ||||||
|  |     RealD Hterm = - P.MomentaAction(); | ||||||
|  |     std::cout << GridLogMessage << "S:Momentum action H_p = " << Hterm << "\n"; | ||||||
|  |     std::cout << GridLogMessage << "S:dSMom = " << Hterm-Saux << "\n"; | ||||||
|  |     Saux=Hterm; | ||||||
|  |     H = Hterm; | ||||||
|  |  | ||||||
|     RealD Hterm; |  | ||||||
|  |  | ||||||
|     // Actions |     // Actions | ||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
| @@ -496,9 +650,18 @@ public: | |||||||
|  |  | ||||||
|     std::cout << GridLogIntegrator << "Integrator initial action\n"; |     std::cout << GridLogIntegrator << "Integrator initial action\n"; | ||||||
|  |  | ||||||
|     RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom | //    RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom | ||||||
|  | //    RealD Hterm; | ||||||
|     RealD Hterm; |     RealD H = - FieldImplementation::FieldSquareNorm(P.Mom)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom | ||||||
|  |     std::cout << GridLogMessage << "S:FieldSquareNorm H_p = " << H << "\n"; | ||||||
|  |     std::cout << GridLogMessage << "S:dSField = " << H-Smom << "\n"; | ||||||
|  |     Smom=H; | ||||||
|  |     P.M.ImportGauge(U); | ||||||
|  |     RealD Hterm = - P.MomentaAction(); | ||||||
|  |     std::cout << GridLogMessage << "S:Momentum action H_p = " << Hterm << "\n"; | ||||||
|  |     std::cout << GridLogMessage << "S:dSMom = " << Hterm-Saux << "\n"; | ||||||
|  |     Saux=Hterm; | ||||||
|  |     H = Hterm; | ||||||
|  |  | ||||||
|     // Actions |     // Actions | ||||||
|     for (int level = 0; level < as.size(); ++level) { |     for (int level = 0; level < as.size(); ++level) { | ||||||
| @@ -521,7 +684,7 @@ public: | |||||||
|   } |   } | ||||||
|  |  | ||||||
|    |    | ||||||
|   void integrate(Field& U)  |   void integrate(Field& U, int traj=-1 )  | ||||||
|   { |   { | ||||||
|     // reset the clocks |     // reset the clocks | ||||||
|     t_U = 0; |     t_U = 0; | ||||||
| @@ -533,6 +696,12 @@ public: | |||||||
|       int first_step = (stp == 0); |       int first_step = (stp == 0); | ||||||
|       int last_step = (stp == Params.MDsteps - 1); |       int last_step = (stp == Params.MDsteps - 1); | ||||||
|       this->step(U, 0, first_step, last_step); |       this->step(U, 0, first_step, last_step); | ||||||
|  |       if (traj>=0){ | ||||||
|  |         std::string file("./config."+std::to_string(traj)+"_"+std::to_string(stp+1) ); | ||||||
|  |         int precision32 = 0; | ||||||
|  |         int tworow      = 0; | ||||||
|  |         NerscIO::writeConfiguration(U,file,tworow,precision32); | ||||||
|  |       } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // Check the clocks all match on all levels |     // Check the clocks all match on all levels | ||||||
| @@ -542,7 +711,6 @@ public: | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     FieldImplementation::Project(U); |     FieldImplementation::Project(U); | ||||||
|  |  | ||||||
|     // and that we indeed got to the end of the trajectory |     // and that we indeed got to the end of the trajectory | ||||||
|     assert(fabs(t_U - Params.trajL) < 1.0e-6); |     assert(fabs(t_U - Params.trajL) < 1.0e-6); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -102,8 +102,8 @@ public: | |||||||
|  |  | ||||||
|   std::string integrator_name(){return "LeapFrog";} |   std::string integrator_name(){return "LeapFrog";} | ||||||
|  |  | ||||||
|   LeapFrog(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm) |   LeapFrog(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M) | ||||||
|     : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm){}; |     : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm,M){}; | ||||||
|  |  | ||||||
|   void step(Field& U, int level, int _first, int _last) { |   void step(Field& U, int level, int _first, int _last) { | ||||||
|     int fl = this->as.size() - 1; |     int fl = this->as.size() - 1; | ||||||
| @@ -140,14 +140,14 @@ template <class FieldImplementation_, class SmearingPolicy, class Representation | |||||||
| class MinimumNorm2 : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy>  | class MinimumNorm2 : public Integrator<FieldImplementation_, SmearingPolicy, RepresentationPolicy>  | ||||||
| { | { | ||||||
| private: | private: | ||||||
|   const RealD lambda = 0.1931833275037836; | //  const RealD lambda = 0.1931833275037836; | ||||||
|  |  | ||||||
| public: | public: | ||||||
|   typedef FieldImplementation_ FieldImplementation; |   typedef FieldImplementation_ FieldImplementation; | ||||||
|   INHERIT_FIELD_TYPES(FieldImplementation); |   INHERIT_FIELD_TYPES(FieldImplementation); | ||||||
|  |  | ||||||
|   MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm) |   MinimumNorm2(GridBase* grid, IntegratorParameters Par, ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M) | ||||||
|     : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm){}; |     : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>(grid, Par, Aset, Sm,M){}; | ||||||
|  |  | ||||||
|   std::string integrator_name(){return "MininumNorm2";} |   std::string integrator_name(){return "MininumNorm2";} | ||||||
|  |  | ||||||
| @@ -155,6 +155,11 @@ public: | |||||||
|     // level  : current level |     // level  : current level | ||||||
|     // fl     : final level |     // fl     : final level | ||||||
|     // eps    : current step size |     // eps    : current step size | ||||||
|  |     assert(level<3); | ||||||
|  |     RealD lambda= this->Params.lambda0; | ||||||
|  |     if (level>0) lambda= this->Params.lambda1; | ||||||
|  |     if (level>1) lambda= this->Params.lambda2; | ||||||
|  |     std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl; | ||||||
|  |  | ||||||
|     int fl = this->as.size() - 1; |     int fl = this->as.size() - 1; | ||||||
|  |  | ||||||
| @@ -210,9 +215,9 @@ public: | |||||||
|   // Looks like dH scales as dt^4. tested wilson/wilson 2 level. |   // Looks like dH scales as dt^4. tested wilson/wilson 2 level. | ||||||
|   ForceGradient(GridBase* grid, IntegratorParameters Par, |   ForceGradient(GridBase* grid, IntegratorParameters Par, | ||||||
|                 ActionSet<Field, RepresentationPolicy>& Aset, |                 ActionSet<Field, RepresentationPolicy>& Aset, | ||||||
|                 SmearingPolicy& Sm) |                 SmearingPolicy& Sm, Metric<Field>& M) | ||||||
|     : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>( |     : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>( | ||||||
| 									    grid, Par, Aset, Sm){}; | 									    grid, Par, Aset, Sm,M){}; | ||||||
|  |  | ||||||
|   std::string integrator_name(){return "ForceGradient";} |   std::string integrator_name(){return "ForceGradient";} | ||||||
|    |    | ||||||
| @@ -275,6 +280,255 @@ public: | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | //////////////////////////////// | ||||||
|  | // Riemannian Manifold HMC | ||||||
|  | // Girolami et al | ||||||
|  | //////////////////////////////// | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | // correct | ||||||
|  | template <class FieldImplementation, class SmearingPolicy, | ||||||
|  |           class RepresentationPolicy = | ||||||
|  |               Representations<FundamentalRepresentation> > | ||||||
|  | class ImplicitLeapFrog : public Integrator<FieldImplementation, SmearingPolicy, | ||||||
|  |                                            RepresentationPolicy> { | ||||||
|  |  public: | ||||||
|  |   typedef ImplicitLeapFrog<FieldImplementation, SmearingPolicy, RepresentationPolicy> | ||||||
|  |       Algorithm; | ||||||
|  |   INHERIT_FIELD_TYPES(FieldImplementation); | ||||||
|  |  | ||||||
|  |   // Riemannian manifold metric operator | ||||||
|  |   // Hermitian operator Fisher | ||||||
|  |  | ||||||
|  |   std::string integrator_name(){return "ImplicitLeapFrog";} | ||||||
|  |  | ||||||
|  |   ImplicitLeapFrog(GridBase* grid, IntegratorParameters Par, | ||||||
|  |            ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M) | ||||||
|  |       : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>( | ||||||
|  |             grid, Par, Aset, Sm, M){}; | ||||||
|  |  | ||||||
|  |   void step(Field& U, int level, int _first, int _last) { | ||||||
|  |     int fl = this->as.size() - 1; | ||||||
|  |     // level  : current level | ||||||
|  |     // fl     : final level | ||||||
|  |     // eps    : current step size | ||||||
|  |  | ||||||
|  |     // Get current level step size | ||||||
|  |     RealD eps = this->Params.trajL/this->Params.MDsteps; | ||||||
|  |     for (int l = 0; l <= level; ++l) eps /= this->as[l].multiplier; | ||||||
|  |  | ||||||
|  |     int multiplier = this->as[level].multiplier; | ||||||
|  |     for (int e = 0; e < multiplier; ++e) { | ||||||
|  |       int first_step = _first && (e == 0); | ||||||
|  |       int last_step = _last && (e == multiplier - 1); | ||||||
|  |  | ||||||
|  |       if (first_step) {  // initial half step | ||||||
|  |        this->implicit_update_P(U, level, eps / 2.0); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       if (level == fl) {  // lowest level | ||||||
|  |         this->implicit_update_U(U, eps,eps/2.); | ||||||
|  |       } else {  // recursive function call | ||||||
|  |         this->step(U, level + 1, first_step, last_step); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       //int mm = last_step ? 1 : 2; | ||||||
|  |       if (last_step){ | ||||||
|  |         this->update_P2(U, level, eps / 2.0); | ||||||
|  |       } else { | ||||||
|  |       this->implicit_update_P(U, level, eps, true);// works intermediate step | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  | template <class FieldImplementation, class SmearingPolicy, | ||||||
|  |           class RepresentationPolicy = | ||||||
|  |               Representations<FundamentalRepresentation> > | ||||||
|  | class ImplicitMinimumNorm2 : public Integrator<FieldImplementation, SmearingPolicy, | ||||||
|  |                                        RepresentationPolicy> { | ||||||
|  |  private: | ||||||
|  | //  const RealD lambda = 0.1931833275037836; | ||||||
|  |  | ||||||
|  |  public: | ||||||
|  |   INHERIT_FIELD_TYPES(FieldImplementation); | ||||||
|  |  | ||||||
|  |   ImplicitMinimumNorm2(GridBase* grid, IntegratorParameters Par, | ||||||
|  |                ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M) | ||||||
|  |       : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>( | ||||||
|  |             grid, Par, Aset, Sm, M){}; | ||||||
|  |  | ||||||
|  |   std::string integrator_name(){return "ImplicitMininumNorm2";} | ||||||
|  |  | ||||||
|  |   void step(Field& U, int level, int _first, int _last) { | ||||||
|  |     // level  : current level | ||||||
|  |     // fl     : final level | ||||||
|  |     // eps    : current step size | ||||||
|  |  | ||||||
|  |     int fl = this->as.size() - 1; | ||||||
|  | //    assert(Params.lambda.size()>level); | ||||||
|  | //    RealD lambda= Params.lambda[level]; | ||||||
|  |     assert(level<3); | ||||||
|  |     RealD lambda= this->Params.lambda0; | ||||||
|  |     if (level>0) lambda= this->Params.lambda1; | ||||||
|  |     if (level>1) lambda= this->Params.lambda2; | ||||||
|  |     std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl; | ||||||
|  |  | ||||||
|  |   if(level<fl){ | ||||||
|  |  | ||||||
|  |     RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0; | ||||||
|  |     for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier; | ||||||
|  |  | ||||||
|  |     // Nesting:  2xupdate_U of size eps/2 | ||||||
|  |     // Next level is eps/2/multiplier | ||||||
|  |  | ||||||
|  |     int multiplier = this->as[level].multiplier; | ||||||
|  |     for (int e = 0; e < multiplier; ++e) {  // steps per step | ||||||
|  |  | ||||||
|  |       int first_step = _first && (e == 0); | ||||||
|  |       int last_step = _last && (e == multiplier - 1); | ||||||
|  |  | ||||||
|  |       if (first_step) {  // initial half step | ||||||
|  |         this->update_P(U, level, lambda * eps); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |         this->step(U, level + 1, first_step, 0); | ||||||
|  |  | ||||||
|  |       this->update_P(U, level, (1.0 - 2.0 * lambda) * eps); | ||||||
|  |  | ||||||
|  |         this->step(U, level + 1, 0, last_step); | ||||||
|  |  | ||||||
|  |       int mm = (last_step) ? 1 : 2; | ||||||
|  |       this->update_P(U, level, lambda * eps * mm); | ||||||
|  |     } | ||||||
|  |   }  | ||||||
|  |   else  | ||||||
|  |   { // last level | ||||||
|  |     RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0; | ||||||
|  |     for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier; | ||||||
|  |  | ||||||
|  |     // Nesting:  2xupdate_U of size eps/2 | ||||||
|  |     // Next level is eps/2/multiplier | ||||||
|  |  | ||||||
|  |     int multiplier = this->as[level].multiplier; | ||||||
|  |     for (int e = 0; e < multiplier; ++e) {  // steps per step | ||||||
|  |  | ||||||
|  |       int first_step = _first && (e == 0); | ||||||
|  |       int last_step = _last && (e == multiplier - 1); | ||||||
|  |  | ||||||
|  |       if (first_step) {  // initial half step | ||||||
|  |         this->implicit_update_P(U, level, lambda * eps); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       this->implicit_update_U(U, 0.5 * eps,lambda*eps); | ||||||
|  |  | ||||||
|  |       this->implicit_update_P(U, level, (1.0 - 2.0 * lambda) * eps, true); | ||||||
|  |  | ||||||
|  |       this->implicit_update_U(U, 0.5 * eps, (0.5-lambda)*eps); | ||||||
|  |  | ||||||
|  |       if (last_step) { | ||||||
|  |         this->update_P2(U, level, eps * lambda); | ||||||
|  |       } else { | ||||||
|  |         this->implicit_update_P(U, level, lambda * eps*2.0, true); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | template <class FieldImplementation, class SmearingPolicy, | ||||||
|  |           class RepresentationPolicy = | ||||||
|  |               Representations<FundamentalRepresentation> > | ||||||
|  | class ImplicitCampostrini : public Integrator<FieldImplementation, SmearingPolicy, | ||||||
|  |                                        RepresentationPolicy> { | ||||||
|  |  private: | ||||||
|  | //  const RealD lambda = 0.1931833275037836; | ||||||
|  |  | ||||||
|  |  public: | ||||||
|  |   INHERIT_FIELD_TYPES(FieldImplementation); | ||||||
|  |  | ||||||
|  |   ImplicitCampostrini(GridBase* grid, IntegratorParameters Par, | ||||||
|  |                ActionSet<Field, RepresentationPolicy>& Aset, SmearingPolicy& Sm, Metric<Field>& M) | ||||||
|  |       : Integrator<FieldImplementation, SmearingPolicy, RepresentationPolicy>( | ||||||
|  |             grid, Par, Aset, Sm, M){}; | ||||||
|  |  | ||||||
|  |   std::string integrator_name(){return "ImplicitCampostrini";} | ||||||
|  |  | ||||||
|  |   void step(Field& U, int level, int _first, int _last) { | ||||||
|  |     // level  : current level | ||||||
|  |     // fl     : final level | ||||||
|  |     // eps    : current step size | ||||||
|  |  | ||||||
|  |     int fl = this->as.size() - 1; | ||||||
|  | //    assert(Params.lambda.size()>level); | ||||||
|  | //    RealD lambda= Params.lambda[level]; | ||||||
|  |     assert(level<3); | ||||||
|  |     RealD lambda= this->Params.lambda0; | ||||||
|  |     if (level>0) lambda= this->Params.lambda1; | ||||||
|  |     if (level>1) lambda= this->Params.lambda2; | ||||||
|  |     std::cout << GridLogMessage << "level: "<<level<< "lambda: "<<lambda<<std::endl; | ||||||
|  |      | ||||||
|  |     RealD sigma=pow(2.0,1./3.); | ||||||
|  |  | ||||||
|  |   if(level<fl){ | ||||||
|  | //Still Omelyan. Needs to change step() to accept variable stepsize | ||||||
|  |     RealD eps = this->Params.trajL/this->Params.MDsteps * 2.0; | ||||||
|  |     for (int l = 0; l <= level; ++l) eps /= 2.0 * this->as[l].multiplier; | ||||||
|  |  | ||||||
|  |     // Nesting:  2xupdate_U of size eps/2 | ||||||
|  |     // Next level is eps/2/multiplier | ||||||
|  |  | ||||||
|  |     int multiplier = this->as[level].multiplier; | ||||||
|  |     for (int e = 0; e < multiplier; ++e) {  // steps per step | ||||||
|  |  | ||||||
|  |       int first_step = _first && (e == 0); | ||||||
|  |       int last_step = _last && (e == multiplier - 1); | ||||||
|  |  | ||||||
|  |       if (first_step) {  // initial half step | ||||||
|  |         this->update_P(U, level, lambda * eps); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |         this->step(U, level + 1, first_step, 0); | ||||||
|  |  | ||||||
|  |       this->update_P(U, level, (1.0 - 2.0 * lambda) * eps); | ||||||
|  |  | ||||||
|  |         this->step(U, level + 1, 0, last_step); | ||||||
|  |  | ||||||
|  |       int mm = (last_step) ? 1 : 2; | ||||||
|  |       this->update_P(U, level, lambda * eps * mm); | ||||||
|  |     } | ||||||
|  |   }  | ||||||
|  |   else  | ||||||
|  |   { // last level | ||||||
|  |     RealD dt = this->Params.trajL/this->Params.MDsteps * 2.0; | ||||||
|  |     for (int l = 0; l <= level; ++l) dt /= 2.0 * this->as[l].multiplier; | ||||||
|  |  | ||||||
|  |     RealD epsilon = dt/(2.0 - sigma); | ||||||
|  |  | ||||||
|  |     int multiplier = this->as[level].multiplier; | ||||||
|  |     for (int e = 0; e < multiplier; ++e) {  // steps per step | ||||||
|  |  | ||||||
|  |       int first_step = _first && (e == 0); | ||||||
|  |       int last_step = _last && (e == multiplier - 1); | ||||||
|  |       // initial half step | ||||||
|  |       if (first_step) {  this->implicit_update_P(U, level, epsilon*0.5); } | ||||||
|  |       this->implicit_update_U(U, epsilon,epsilon*0.5); | ||||||
|  |       this->implicit_update_P(U, level, (1.0 - sigma) * epsilon *0.5, epsilon*0.5, true); | ||||||
|  |       this->implicit_update_U(U, -epsilon*sigma, -epsilon*sigma*0.5); | ||||||
|  |       this->implicit_update_P(U, level, (1.0 - sigma) * epsilon *0.5, -epsilon*sigma*0.5, true); | ||||||
|  |       this->implicit_update_U(U, epsilon,epsilon*0.5); | ||||||
|  |       if (last_step) { this->update_P2(U, level, epsilon*0.5 ); }  | ||||||
|  |       else | ||||||
|  |       this->implicit_update_P(U, level, epsilon,epsilon*0.5); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| #endif  // INTEGRATOR_INCLUDED | #endif  // INTEGRATOR_INCLUDED | ||||||
|   | |||||||
| @@ -1,4 +1,3 @@ | |||||||
|  |  | ||||||
| /*! | /*! | ||||||
|   @file GaugeConfiguration.h |   @file GaugeConfiguration.h | ||||||
|   @brief Declares the GaugeConfiguration class |   @brief Declares the GaugeConfiguration class | ||||||
| @@ -7,15 +6,6 @@ | |||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class T> void Dump(const Lattice<T> & lat, |  | ||||||
| 			    std::string s, |  | ||||||
| 			    Coordinate site = Coordinate({0,0,0,0})) |  | ||||||
| { |  | ||||||
|   typename T::scalar_object tmp; |  | ||||||
|   peekSite(tmp,lat,site); |  | ||||||
|   std::cout << " Dump "<<s<<" "<<tmp<<std::endl; |  | ||||||
| } |  | ||||||
| /*! | /*! | ||||||
|   @brief Smeared configuration masked container |   @brief Smeared configuration masked container | ||||||
|   Modified for a multi-subset smearing (aka Luscher Flowed HMC) |   Modified for a multi-subset smearing (aka Luscher Flowed HMC) | ||||||
| @@ -38,101 +28,6 @@ private: | |||||||
|   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; |   typedef typename SU3Adjoint::LatticeAdjMatrix  AdjMatrixField; | ||||||
|   typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField; |   typedef typename SU3Adjoint::LatticeAdjVector  AdjVectorField; | ||||||
|  |  | ||||||
|   void BaseSmearDerivative(GaugeField& SigmaTerm, |  | ||||||
| 			   const GaugeField& iLambda, |  | ||||||
| 			   const GaugeField& U, |  | ||||||
| 			   int mmu, RealD rho) |  | ||||||
|   { |  | ||||||
|     // Reference |  | ||||||
|     // Morningstar, Peardon, Phys.Rev.D69,054501(2004) |  | ||||||
|     // Equation 75 |  | ||||||
|     // Computing Sigma_mu, derivative of S[fat links] with respect to the thin links |  | ||||||
|     // Output SigmaTerm |  | ||||||
|  |  | ||||||
|     GridBase *grid = U.Grid(); |  | ||||||
|  |  | ||||||
|     WilsonLoops<Gimpl> WL; |  | ||||||
|     GaugeLinkField staple(grid), u_tmp(grid); |  | ||||||
|     GaugeLinkField iLambda_mu(grid), iLambda_nu(grid); |  | ||||||
|     GaugeLinkField U_mu(grid), U_nu(grid); |  | ||||||
|     GaugeLinkField sh_field(grid), temp_Sigma(grid); |  | ||||||
|     Real rho_munu, rho_numu; |  | ||||||
|  |  | ||||||
|     rho_munu = rho; |  | ||||||
|     rho_numu = rho; |  | ||||||
|     for(int mu = 0; mu < Nd; ++mu){ |  | ||||||
|       U_mu       = peekLorentz(      U, mu); |  | ||||||
|       iLambda_mu = peekLorentz(iLambda, mu); |  | ||||||
|  |  | ||||||
|       for(int nu = 0; nu < Nd; ++nu){ |  | ||||||
| 	if(nu==mu) continue; |  | ||||||
|  |  | ||||||
| 	U_nu       = peekLorentz(      U, nu); |  | ||||||
|  |  | ||||||
| 	// Nd(nd-1) = 12 staples normally. |  | ||||||
| 	// We must compute 6 of these |  | ||||||
| 	// in FTHMC case |  | ||||||
| 	if ( (mu==mmu)||(nu==mmu) ) |  | ||||||
| 	  WL.StapleUpper(staple, U, mu, nu); |  | ||||||
| 	 |  | ||||||
| 	if(nu==mmu) { |  | ||||||
| 	  iLambda_nu = peekLorentz(iLambda, nu); |  | ||||||
|  |  | ||||||
| 	  temp_Sigma = -rho_numu*staple*iLambda_nu;  //ok |  | ||||||
| 	  //-r_numu*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x)*Lambda_nu(x) |  | ||||||
| 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); |  | ||||||
|  |  | ||||||
| 	  sh_field = Cshift(iLambda_nu, mu, 1);// general also for Gparity? |  | ||||||
|  |  | ||||||
| 	  temp_Sigma = rho_numu*sh_field*staple; //ok |  | ||||||
| 	  //r_numu*Lambda_nu(mu)*U_nu(x+mu)*Udag_mu(x+nu)*Udag_nu(x) |  | ||||||
| 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	if ( mu == mmu ) {  |  | ||||||
| 	  sh_field = Cshift(iLambda_mu, nu, 1); |  | ||||||
|  |  | ||||||
| 	  temp_Sigma = -rho_munu*staple*U_nu*sh_field*adj(U_nu); //ok |  | ||||||
| 	  //-r_munu*U_nu(x+mu)*Udag_mu(x+nu)*Lambda_mu(x+nu)*Udag_nu(x) |  | ||||||
| 	  Gimpl::AddLink(SigmaTerm, temp_Sigma, mu); |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	//	staple = Zero(); |  | ||||||
| 	sh_field = Cshift(U_nu, mu, 1); |  | ||||||
|  |  | ||||||
| 	temp_Sigma = Zero(); |  | ||||||
|  |  | ||||||
| 	if ( mu == mmu ) |  | ||||||
| 	  temp_Sigma = -rho_munu*adj(sh_field)*adj(U_mu)*iLambda_mu*U_nu; |  | ||||||
|  |  | ||||||
| 	if ( nu == mmu ) { |  | ||||||
| 	  temp_Sigma += rho_numu*adj(sh_field)*adj(U_mu)*iLambda_nu*U_nu; |  | ||||||
|  |  | ||||||
| 	  u_tmp = adj(U_nu)*iLambda_nu; |  | ||||||
| 	  sh_field = Cshift(u_tmp, mu, 1); |  | ||||||
| 	  temp_Sigma += -rho_numu*sh_field*adj(U_mu)*U_nu; |  | ||||||
| 	} |  | ||||||
| 	 |  | ||||||
| 	sh_field = Cshift(temp_Sigma, nu, -1); |  | ||||||
| 	Gimpl::AddLink(SigmaTerm, sh_field, mu); |  | ||||||
|  |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   void BaseSmear(GaugeLinkField& Cup, const GaugeField& U,int mu,RealD rho) { |  | ||||||
|     GridBase *grid = U.Grid(); |  | ||||||
|     GaugeLinkField tmp_stpl(grid); |  | ||||||
|     WilsonLoops<Gimpl> WL; |  | ||||||
|     Cup = Zero(); |  | ||||||
|     for(int nu=0; nu<Nd; ++nu){ |  | ||||||
|       if (nu != mu) { |  | ||||||
| 	// get the staple in direction mu, nu |  | ||||||
| 	WL.Staple(tmp_stpl, U, mu, nu);  //nb staple conventions of IroIro and Grid differ by a dagger |  | ||||||
| 	Cup += adj(tmp_stpl*rho); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   // Adjoint vector to GaugeField force |   // Adjoint vector to GaugeField force | ||||||
|   void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu) |   void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu) | ||||||
|   { |   { | ||||||
| @@ -152,54 +47,27 @@ private: | |||||||
|     GaugeLinkField UtaU(PlaqL.Grid()); |     GaugeLinkField UtaU(PlaqL.Grid()); | ||||||
|     GaugeLinkField D(PlaqL.Grid()); |     GaugeLinkField D(PlaqL.Grid()); | ||||||
|     AdjMatrixField Dbc(PlaqL.Grid()); |     AdjMatrixField Dbc(PlaqL.Grid()); | ||||||
|     AdjMatrixField Dbc_opt(PlaqL.Grid()); |  | ||||||
|     LatticeComplex tmp(PlaqL.Grid()); |     LatticeComplex tmp(PlaqL.Grid()); | ||||||
|     const int Ngen = SU3Adjoint::Dimension; |     const int Ngen = SU3Adjoint::Dimension; | ||||||
|     Complex ci(0,1); |     Complex ci(0,1); | ||||||
|     ColourMatrix   ta,tb,tc; |     ColourMatrix   ta,tb,tc; | ||||||
|     RealD t=0; |      | ||||||
|     RealD tp=0; |  | ||||||
|     RealD tta=0; |  | ||||||
|     RealD tpk=0; |  | ||||||
|     t-=usecond(); |  | ||||||
|     for(int a=0;a<Ngen;a++) { |     for(int a=0;a<Ngen;a++) { | ||||||
|       tta-=usecond(); |  | ||||||
|       SU3::generator(a, ta); |       SU3::generator(a, ta); | ||||||
|       ta = 2.0 * ci * ta; |  | ||||||
|       // Qlat Tb = 2i Tb^Grid |       // Qlat Tb = 2i Tb^Grid | ||||||
|       UtaU= adj(PlaqL)*ta*PlaqR; // 6ms |       UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR; | ||||||
|       tta+=usecond(); |  | ||||||
|       //////////////////////////////////////////// |  | ||||||
|       // Could add this entire C-loop to a projection routine |  | ||||||
|       // for performance. Could also pick checkerboard on UtaU |  | ||||||
|       // and set checkerboard on result for 2x perf |  | ||||||
|       //////////////////////////////////////////// |  | ||||||
|       for(int c=0;c<Ngen;c++) { |       for(int c=0;c<Ngen;c++) { | ||||||
| 	SU3::generator(c, tc); | 	SU3::generator(c, tc); | ||||||
| 	tc = 2.0*ci*tc; | 	D = Ta( (2.0)*ci*tc *UtaU); | ||||||
| 	tp-=usecond();  |  | ||||||
| 	D = Ta( tc *UtaU); // 2ms |  | ||||||
| #if 1 |  | ||||||
| 	SU3::LieAlgebraProject(Dbc_opt,D,c); // 5.5ms |  | ||||||
| #else |  | ||||||
| 	for(int b=0;b<Ngen;b++){ | 	for(int b=0;b<Ngen;b++){ | ||||||
| 	  SU3::generator(b, tb); | 	  SU3::generator(b, tb); | ||||||
| 	  tmp =-trace(ci*tb*D);  | 	  tmp =-trace(ci*tb*D);  | ||||||
| 	  PokeIndex<ColourIndex>(Dbc,tmp,b,c);  // Adjoint rep | 	  PokeIndex<ColourIndex>(Dbc,tmp,b,c);  // Adjoint rep | ||||||
| 	} | 	} | ||||||
| #endif |  | ||||||
| 	tp+=usecond(); |  | ||||||
|       } |       } | ||||||
|       //      Dump(Dbc_opt,"Dbc_opt"); |       tmp = trace(MpInvJx * Dbc); | ||||||
|       //      Dump(Dbc,"Dbc"); |  | ||||||
|       tpk-=usecond(); |  | ||||||
|       tmp = trace(MpInvJx * Dbc_opt); |  | ||||||
|       PokeIndex<ColourIndex>(Fdet2,tmp,a); |       PokeIndex<ColourIndex>(Fdet2,tmp,a); | ||||||
|       tpk+=usecond(); |  | ||||||
|     } |     } | ||||||
|     t+=usecond(); |  | ||||||
|     std::cout << GridLogPerformance << " Compute_MpInvJx_dNxxdSy " << t/1e3 << " ms  proj "<<tp/1e3<< " ms" |  | ||||||
| 	      << " ta "<<tta/1e3<<" ms" << " poke "<<tpk/1e3<< " ms"<<std::endl; |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd) |   void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd) | ||||||
| @@ -211,17 +79,12 @@ private: | |||||||
|     ColourMatrix   tc; |     ColourMatrix   tc; | ||||||
|     for(int b=0;b<Ngen;b++) { |     for(int b=0;b<Ngen;b++) { | ||||||
|       SU3::generator(b, tb); |       SU3::generator(b, tb); | ||||||
|       tb = 2.0 * ci * tb; |       Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR ); | ||||||
|       Nx = Ta( adj(PlaqL)*tb * PlaqR ); |  | ||||||
| #if 1 |  | ||||||
|       SU3::LieAlgebraProject(NxAd,Nx,b); |  | ||||||
| #else |  | ||||||
|       for(int c=0;c<Ngen;c++) { |       for(int c=0;c<Ngen;c++) { | ||||||
| 	SU3::generator(c, tc); | 	SU3::generator(c, tc); | ||||||
| 	auto tmp =closure( -trace(ci*tc*Nx));  | 	auto tmp =closure( -trace(ci*tc*Nx));  | ||||||
| 	PokeIndex<ColourIndex>(NxAd,tmp,c,b);  | 	PokeIndex<ColourIndex>(NxAd,tmp,c,b);  | ||||||
|       } |       } | ||||||
| #endif |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   void ApplyMask(GaugeField &U,int smr) |   void ApplyMask(GaugeField &U,int smr) | ||||||
| @@ -301,7 +164,8 @@ public: | |||||||
|     // Computes ALL the staples -- could compute one only and do it here |     // Computes ALL the staples -- could compute one only and do it here | ||||||
|     RealD time; |     RealD time; | ||||||
|     time=-usecond(); |     time=-usecond(); | ||||||
|     BaseSmear(Cmu, U,mu,rho); |     this->StoutSmearing->BaseSmear(C, U); | ||||||
|  |     Cmu = peekLorentz(C, mu); | ||||||
|  |  | ||||||
|     ////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////// | ||||||
|     // Assemble Luscher exp diff map J matrix  |     // Assemble Luscher exp diff map J matrix  | ||||||
| @@ -345,36 +209,6 @@ public: | |||||||
|     // dJ(x)/dxe |     // dJ(x)/dxe | ||||||
|     ////////////////////////////////////// |     ////////////////////////////////////// | ||||||
|     time=-usecond(); |     time=-usecond(); | ||||||
| #if 1 |  | ||||||
|     std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid); |  | ||||||
|     std::vector<AdjMatrix> TRb_s; TRb_s.resize(8); |  | ||||||
|     AdjMatrixField tbXn(grid); |  | ||||||
|     AdjMatrixField sumXtbX(grid); |  | ||||||
|     AdjMatrixField t2(grid); |  | ||||||
|     AdjMatrixField dt2(grid); |  | ||||||
|     AdjMatrixField t3(grid); |  | ||||||
|     AdjMatrixField dt3(grid); |  | ||||||
|     AdjMatrixField aunit(grid); |  | ||||||
|  |  | ||||||
|     for(int b=0;b<8;b++){ |  | ||||||
|       SU3Adjoint::generator(b, TRb_s[b]); |  | ||||||
|       dJdX[b] = TRb_s[b]; |  | ||||||
|     } |  | ||||||
|     aunit = ComplexD(1.0); |  | ||||||
|     // Could put into an accelerator_for |  | ||||||
|     X  = (-1.0)*ZxAd;  |  | ||||||
|     t2 = X; |  | ||||||
|     for (int j = 12; j > 1; --j) { |  | ||||||
|       t3  = t2*(1.0 / (j + 1))  + aunit; |  | ||||||
|       t2  = X * t3; |  | ||||||
|       for(int b=0;b<8;b++){ |  | ||||||
| 	dJdX[b]= TRb_s[b] * t3 + X * dJdX[b]*(1.0 / (j + 1)); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     for(int b=0;b<8;b++){ |  | ||||||
|       dJdX[b] = -dJdX[b]; |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid); |     std::vector<AdjMatrixField>  dJdX;    dJdX.resize(8,grid); | ||||||
|     AdjMatrixField tbXn(grid); |     AdjMatrixField tbXn(grid); | ||||||
|     AdjMatrixField sumXtbX(grid); |     AdjMatrixField sumXtbX(grid); | ||||||
| @@ -390,15 +224,14 @@ public: | |||||||
|       X  = (-1.0)*ZxAd;  |       X  = (-1.0)*ZxAd;  | ||||||
|       t2 = X; |       t2 = X; | ||||||
|       dt2 = TRb; |       dt2 = TRb; | ||||||
|       for (int j = 12; j > 1; --j) { |       for (int j = 20; j > 1; --j) { | ||||||
| 	t3  = t2*(1.0 / (j + 1))  + aunit; | 	t3 = t2*(1.0 / (j + 1))  + aunit; | ||||||
| 	dt3 = dt2*(1.0 / (j + 1)); | 	dt3 = dt2*(1.0 / (j + 1)); | ||||||
| 	t2 = X * t3; | 	t2 = X * t3; | ||||||
| 	dt2 = TRb * t3 + X * dt3; | 	dt2 = TRb * t3 + X * dt3; | ||||||
|       } |       } | ||||||
|       dJdX[b] = -dt2;  |       dJdX[b] = -dt2;  | ||||||
|     } |     } | ||||||
| #endif   |  | ||||||
|     time+=usecond(); |     time+=usecond(); | ||||||
|     std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl; |     std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl; | ||||||
|     ///////////////////////////////////////////////////////////////// |     ///////////////////////////////////////////////////////////////// | ||||||
| @@ -448,8 +281,8 @@ public: | |||||||
|      |      | ||||||
|     for(int e =0 ; e<8 ; e++){ |     for(int e =0 ; e<8 ; e++){ | ||||||
|       LatticeComplexD tr(grid); |       LatticeComplexD tr(grid); | ||||||
|       //      ColourMatrix te; |       ColourMatrix te; | ||||||
|       //      SU3::generator(e, te); |       SU3::generator(e, te); | ||||||
|       tr = trace(dJdX[e] * nMpInv); |       tr = trace(dJdX[e] * nMpInv); | ||||||
|       pokeColour(dJdXe_nMpInv,tr,e); |       pokeColour(dJdXe_nMpInv,tr,e); | ||||||
|     } |     } | ||||||
| @@ -660,25 +493,20 @@ public: | |||||||
|     ////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////// | ||||||
|     // Assemble the N matrix |     // Assemble the N matrix | ||||||
|     ////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////// | ||||||
|     double rho=this->StoutSmearing->SmearRho[1]; |     // Computes ALL the staples -- could compute one only here | ||||||
|     BaseSmear(Cmu, U,mu,rho); |     this->StoutSmearing->BaseSmear(C, U); | ||||||
|  |     Cmu = peekLorentz(C, mu); | ||||||
|     Umu = peekLorentz(U, mu); |     Umu = peekLorentz(U, mu); | ||||||
|     Complex ci(0,1); |     Complex ci(0,1); | ||||||
|     for(int b=0;b<Ngen;b++) { |     for(int b=0;b<Ngen;b++) { | ||||||
|       SU3::generator(b, Tb); |       SU3::generator(b, Tb); | ||||||
|       // Qlat Tb = 2i Tb^Grid |       // Qlat Tb = 2i Tb^Grid | ||||||
|       Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu)); |       Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu)); | ||||||
|       // FIXME -- replace this with LieAlgebraProject |  | ||||||
| #if 0 |  | ||||||
|       SU3::LieAlgebraProject(Ncb,tmp,b); |  | ||||||
| #else |  | ||||||
|       for(int c=0;c<Ngen;c++) { |       for(int c=0;c<Ngen;c++) { | ||||||
| 	SU3::generator(c, Tc); | 	SU3::generator(c, Tc); | ||||||
| 	auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important | 	auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important | ||||||
| 	PokeIndex<ColourIndex>(Ncb,tmp,c,b);  | 	PokeIndex<ColourIndex>(Ncb,tmp,c,b);  | ||||||
|       } |       } | ||||||
| #endif |  | ||||||
|     }       |     }       | ||||||
|  |  | ||||||
|     ////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////// | ||||||
| @@ -865,19 +693,15 @@ private: | |||||||
| 					  const GaugeField& GaugeK,int level)  | 					  const GaugeField& GaugeK,int level)  | ||||||
|   { |   { | ||||||
|     GridBase* grid = GaugeK.Grid(); |     GridBase* grid = GaugeK.Grid(); | ||||||
|     GaugeField SigmaK(grid), iLambda(grid); |     GaugeField C(grid), SigmaK(grid), iLambda(grid); | ||||||
|     GaugeField SigmaKPrimeA(grid); |     GaugeField SigmaKPrimeA(grid); | ||||||
|     GaugeField SigmaKPrimeB(grid); |     GaugeField SigmaKPrimeB(grid); | ||||||
|     GaugeLinkField iLambda_mu(grid); |     GaugeLinkField iLambda_mu(grid); | ||||||
|     GaugeLinkField iQ(grid), e_iQ(grid); |     GaugeLinkField iQ(grid), e_iQ(grid); | ||||||
|     GaugeLinkField SigmaKPrime_mu(grid); |     GaugeLinkField SigmaKPrime_mu(grid); | ||||||
|     GaugeLinkField GaugeKmu(grid), Cmu(grid); |     GaugeLinkField GaugeKmu(grid), Cmu(grid); | ||||||
|  |      | ||||||
|     int mmu= (level/2) %Nd; |     this->StoutSmearing->BaseSmear(C, GaugeK); | ||||||
|     int cb= (level%2); |  | ||||||
|     double rho=this->StoutSmearing->SmearRho[1]; |  | ||||||
|  |  | ||||||
|     // Can override this to do one direction only. |  | ||||||
|     SigmaK = Zero(); |     SigmaK = Zero(); | ||||||
|     iLambda = Zero(); |     iLambda = Zero(); | ||||||
|  |  | ||||||
| @@ -888,38 +712,18 @@ private: | |||||||
|     // Could get away with computing only one polarisation here |     // Could get away with computing only one polarisation here | ||||||
|     // int mu= (smr/2) %Nd; |     // int mu= (smr/2) %Nd; | ||||||
|     // SigmaKprime_A has only one component |     // SigmaKprime_A has only one component | ||||||
| #if 0 |     for (int mu = 0; mu < Nd; mu++) | ||||||
|     BaseSmear(Cmu, GaugeK,mu,rho); |  | ||||||
|     GaugeKmu = peekLorentz(GaugeK, mu); |  | ||||||
|     SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu); |  | ||||||
|     iQ = Ta(Cmu * adj(GaugeKmu)); |  | ||||||
|     this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu); |  | ||||||
|     pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); |  | ||||||
|     pokeLorentz(iLambda, iLambda_mu, mu); |  | ||||||
|     BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase |  | ||||||
| #else |  | ||||||
|     //    GaugeField C(grid); |  | ||||||
|     //    this->StoutSmearing->BaseSmear(C, GaugeK); |  | ||||||
|     //    for (int mu = 0; mu < Nd; mu++) |  | ||||||
|     int mu =mmu; |  | ||||||
|     BaseSmear(Cmu, GaugeK,mu,rho); |  | ||||||
|     { |     { | ||||||
|       // Cmu = peekLorentz(C, mu); |       Cmu = peekLorentz(C, mu); | ||||||
|       GaugeKmu = peekLorentz(GaugeK, mu); |       GaugeKmu = peekLorentz(GaugeK, mu); | ||||||
|       SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu); |       SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu); | ||||||
|       iQ = Ta(Cmu * adj(GaugeKmu)); |       iQ = Ta(Cmu * adj(GaugeKmu)); | ||||||
|       this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu); |       this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu); | ||||||
|       pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); |       pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu); | ||||||
|       pokeLorentz(iLambda, iLambda_mu, mu); |       pokeLorentz(iLambda, iLambda_mu, mu); | ||||||
|       std::cout << " mu "<<mu<<" SigmaKPrime_mu"<<norm2(SigmaKPrime_mu)<< " iLambda_mu " <<norm2(iLambda_mu)<<std::endl; |  | ||||||
|     } |     } | ||||||
|     //    GaugeField SigmaKcopy(grid); |     this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase | ||||||
|     //    SigmaKcopy = SigmaK; |  | ||||||
|     BaseSmearDerivative(SigmaK, iLambda,GaugeK,mu,rho);  // derivative of SmearBase |  | ||||||
|     //    this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK);  // derivative of SmearBase |  | ||||||
|     //    SigmaKcopy = SigmaKcopy - SigmaK; |  | ||||||
|     //    std::cout << " BaseSmearDerivative fast path error" <<norm2(SigmaKcopy)<<std::endl; |  | ||||||
| #endif |  | ||||||
|     //////////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////////// | ||||||
|     // propagate the rest of the force as identity map, just add back |     // propagate the rest of the force as identity map, just add back | ||||||
|     //////////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -1,389 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Source file: ./lib/qcd/smearing/HISQSmearing.h |  | ||||||
|  |  | ||||||
| Copyright (C) 2023 |  | ||||||
|  |  | ||||||
| Author: D. A. Clarke <clarke.davida@gmail.com>  |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /* |  | ||||||
|     @file HISQSmearing.h |  | ||||||
|     @brief Declares classes related to HISQ smearing  |  | ||||||
| */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
| #include <Grid/lattice/PaddedCell.h> |  | ||||||
| #include <Grid/stencil/GeneralLocalStencil.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); |  | ||||||
|  |  | ||||||
|  |  | ||||||
| // TODO: find a way to fold this into the stencil header. need to access grid to get |  | ||||||
| // Nd, since you don't want to inherit from QCD.h |  | ||||||
| /*!  @brief append arbitrary shift path to shifts */ |  | ||||||
| template<typename... Args> |  | ||||||
| void appendShift(std::vector<Coordinate>& shifts, int dir, Args... args) { |  | ||||||
|     Coordinate shift(Nd,0); |  | ||||||
|     generalShift(shift, dir, args...);  |  | ||||||
|     // push_back creates an element at the end of shifts and |  | ||||||
|     // assigns the data in the argument to it. |  | ||||||
|     shifts.push_back(shift); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| /*!  @brief figure out the stencil index from mu and nu */ |  | ||||||
| accelerator_inline int stencilIndex(int mu, int nu) { |  | ||||||
|     // Nshifts depends on how you built the stencil |  | ||||||
|     int Nshifts = 6; |  | ||||||
|     return Nshifts*nu + Nd*Nshifts*mu; |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| /*!  @brief structure holding the link treatment */ |  | ||||||
| struct SmearingParameters{ |  | ||||||
|     SmearingParameters(){} |  | ||||||
|     Real c_1;               // 1 link |  | ||||||
|     Real c_naik;            // Naik term |  | ||||||
|     Real c_3;               // 3 link |  | ||||||
|     Real c_5;               // 5 link |  | ||||||
|     Real c_7;               // 7 link |  | ||||||
|     Real c_lp;              // 5 link Lepage |  | ||||||
|     SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)  |  | ||||||
|         : c_1(c1), |  | ||||||
|           c_naik(cnaik), |  | ||||||
|           c_3(c3), |  | ||||||
|           c_5(c5), |  | ||||||
|           c_7(c7), |  | ||||||
|           c_lp(clp){} |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| /*!  @brief create fat links from link variables */ |  | ||||||
| template<class Gimpl>  |  | ||||||
| class Smear_HISQ : public Gimpl { |  | ||||||
|  |  | ||||||
| private: |  | ||||||
|     GridCartesian* const _grid; |  | ||||||
|     SmearingParameters _linkTreatment; |  | ||||||
|  |  | ||||||
| public: |  | ||||||
|  |  | ||||||
|     INHERIT_GIMPL_TYPES(Gimpl); |  | ||||||
|     typedef typename Gimpl::GaugeField     GF; |  | ||||||
|     typedef typename Gimpl::GaugeLinkField LF; |  | ||||||
|     typedef typename Gimpl::ComplexField   CF; |  | ||||||
|  |  | ||||||
|     // Don't allow default values here. |  | ||||||
|     Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)  |  | ||||||
|         : _grid(grid),  |  | ||||||
|           _linkTreatment(c1,cnaik,c3,c5,c7,clp) { |  | ||||||
|         assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); |  | ||||||
|         assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Allow to pass a pointer to a C-style, double array for MILC convenience |  | ||||||
|     Smear_HISQ(GridCartesian* grid, double* coeff)  |  | ||||||
|         : _grid(grid),  |  | ||||||
|           _linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) { |  | ||||||
|         assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3"); |  | ||||||
|         assert(Nd == 4 && "HISQ smearing only defined for Nd==4"); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     ~Smear_HISQ() {} |  | ||||||
|  |  | ||||||
|     // Intent: OUT--u_smr, u_naik |  | ||||||
|     //          IN--u_thin |  | ||||||
|     void smear(GF& u_smr, GF& u_naik, GF& u_thin) const { |  | ||||||
|  |  | ||||||
|         SmearingParameters lt = this->_linkTreatment; |  | ||||||
|         auto grid = this->_grid; |  | ||||||
|  |  | ||||||
|         // Create a padded cell of extra padding depth=1 and fill the padding. |  | ||||||
|         int depth = 1; |  | ||||||
|         PaddedCell Ghost(depth,grid); |  | ||||||
|         GF Ughost = Ghost.Exchange(u_thin); |  | ||||||
|  |  | ||||||
|         // This is where auxiliary N-link fields and the final smear will be stored.  |  | ||||||
|         GF Ughost_fat(Ughost.Grid()); |  | ||||||
|         GF Ughost_3link(Ughost.Grid()); |  | ||||||
|         GF Ughost_5linkA(Ughost.Grid()); |  | ||||||
|         GF Ughost_5linkB(Ughost.Grid()); |  | ||||||
|  |  | ||||||
|         // mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier, |  | ||||||
|         // but these entries will not be used.  |  | ||||||
|         std::vector<Coordinate> shifts; |  | ||||||
|         for(int mu=0;mu<Nd;mu++) |  | ||||||
|         for(int nu=0;nu<Nd;nu++) { |  | ||||||
|             appendShift(shifts,mu); |  | ||||||
|             appendShift(shifts,nu); |  | ||||||
|             appendShift(shifts,shiftSignal::NO_SHIFT); |  | ||||||
|             appendShift(shifts,mu,Back(nu)); |  | ||||||
|             appendShift(shifts,Back(nu)); |  | ||||||
|             appendShift(shifts,Back(mu)); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // A GeneralLocalStencil has two indices: a site and stencil index  |  | ||||||
|         GeneralLocalStencil gStencil(Ughost.Grid(),shifts); |  | ||||||
|  |  | ||||||
|         // This is where contributions from the smearing get added together |  | ||||||
|         Ughost_fat=Zero(); |  | ||||||
|  |  | ||||||
|         // This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik. |  | ||||||
|         for(int mu=0;mu<Nd;mu++) { |  | ||||||
|  |  | ||||||
|             // TODO: This approach is slightly memory inefficient. It uses 25% extra memory  |  | ||||||
|             Ughost_3link =Zero(); |  | ||||||
|             Ughost_5linkA=Zero(); |  | ||||||
|             Ughost_5linkB=Zero(); |  | ||||||
|  |  | ||||||
|             // Create the accessors |  | ||||||
|             autoView(U_v       , Ughost       , AcceleratorRead); |  | ||||||
|             autoView(U_fat_v   , Ughost_fat   , AcceleratorWrite); |  | ||||||
|             autoView(U_3link_v , Ughost_3link , AcceleratorWrite); |  | ||||||
|             autoView(U_5linkA_v, Ughost_5linkA, AcceleratorWrite); |  | ||||||
|             autoView(U_5linkB_v, Ughost_5linkB, AcceleratorWrite); |  | ||||||
|  |  | ||||||
|             // We infer some types that will be needed in the calculation. |  | ||||||
|             typedef decltype(gStencil.GetEntry(0,0)) stencilElement; |  | ||||||
|             typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix; |  | ||||||
|  |  | ||||||
|             int Nsites = U_v.size(); |  | ||||||
|             auto gStencil_v = gStencil.View();  |  | ||||||
|  |  | ||||||
|             accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs |  | ||||||
|                 stencilElement SE0, SE1, SE2, SE3, SE4, SE5; |  | ||||||
|                 U3matrix U0, U1, U2, U3, U4, U5, W; |  | ||||||
|                 for(int nu=0;nu<Nd;nu++) { |  | ||||||
|                     if(nu==mu) continue; |  | ||||||
|                     int s = stencilIndex(mu,nu); |  | ||||||
|  |  | ||||||
|                     // The stencil gives us support points in the mu-nu plane that we will use to |  | ||||||
|                     // grab the links we need. |  | ||||||
|                     SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset; |  | ||||||
|                     SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset; |  | ||||||
|                     SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset; |  | ||||||
|                     SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; |  | ||||||
|                     SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset; |  | ||||||
|                     SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu      = SE5->_offset; |  | ||||||
|  |  | ||||||
|                     // When you're deciding whether to take an adjoint, the question is: how is the |  | ||||||
|                     // stored link oriented compared to the one you want? If I imagine myself travelling |  | ||||||
|                     // with the to-be-updated link, I have two possible, alternative 3-link paths I can |  | ||||||
|                     // take, one starting by going to the left, the other starting by going to the right. |  | ||||||
|                     U0 = coalescedReadGeneralPermute(U_v[x_p_mu     ](nu),SE0->_permute,Nd); |  | ||||||
|                     U1 = coalescedReadGeneralPermute(U_v[x_p_nu     ](mu),SE1->_permute,Nd); |  | ||||||
|                     U2 = coalescedReadGeneralPermute(U_v[x          ](nu),SE2->_permute,Nd); |  | ||||||
|                     U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd); |  | ||||||
|                     U4 = coalescedReadGeneralPermute(U_v[x_m_nu     ](mu),SE4->_permute,Nd); |  | ||||||
|                     U5 = coalescedReadGeneralPermute(U_v[x_m_nu     ](nu),SE4->_permute,Nd); |  | ||||||
|  |  | ||||||
|                     //  "left"          "right" |  | ||||||
|                     W = U2*U1*adj(U0) + adj(U5)*U4*U3; |  | ||||||
|  |  | ||||||
|                     // Save 3-link construct for later and add to smeared field. |  | ||||||
|                     coalescedWrite(U_3link_v[x](nu), W); |  | ||||||
|  |  | ||||||
|                     // The index operator (x) returns the coalesced read on GPU. The view [] index returns  |  | ||||||
|                     // a reference to the vector object. The [x](mu) returns a reference to the densely  |  | ||||||
|                     // packed (contiguous in memory) mu-th element of the vector object. On CPU,  |  | ||||||
|                     // coalescedRead/Write is the identity mapping assigning vector object to vector object. |  | ||||||
|                     // But on GPU it's non-trivial and maps scalar object to vector object and vice versa. |  | ||||||
|                     coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W); |  | ||||||
|                 } |  | ||||||
|             }) |  | ||||||
|  |  | ||||||
|             accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link  |  | ||||||
|                 stencilElement SE0, SE1, SE2, SE3, SE4, SE5; |  | ||||||
|                 U3matrix U0, U1, U2, U3, U4, U5, W; |  | ||||||
|                 int sigmaIndex = 0; |  | ||||||
|                 for(int nu=0;nu<Nd;nu++) { |  | ||||||
|                     if(nu==mu) continue; |  | ||||||
|                     int s = stencilIndex(mu,nu); |  | ||||||
|                     for(int rho=0;rho<Nd;rho++) { |  | ||||||
|                         if (rho == mu || rho == nu) continue; |  | ||||||
|  |  | ||||||
|                         SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset; |  | ||||||
|                         SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset; |  | ||||||
|                         SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset; |  | ||||||
|                         SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; |  | ||||||
|                         SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset; |  | ||||||
|  |  | ||||||
|                         U0 = coalescedReadGeneralPermute(      U_v[x_p_mu     ](nu ),SE0->_permute,Nd); |  | ||||||
|                         U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu     ](rho),SE1->_permute,Nd); |  | ||||||
|                         U2 = coalescedReadGeneralPermute(      U_v[x          ](nu ),SE2->_permute,Nd); |  | ||||||
|                         U3 = coalescedReadGeneralPermute(      U_v[x_p_mu_m_nu](nu ),SE3->_permute,Nd); |  | ||||||
|                         U4 = coalescedReadGeneralPermute(U_3link_v[x_m_nu     ](rho),SE4->_permute,Nd); |  | ||||||
|                         U5 = coalescedReadGeneralPermute(      U_v[x_m_nu     ](nu ),SE4->_permute,Nd); |  | ||||||
|  |  | ||||||
|                         W  = U2*U1*adj(U0) + adj(U5)*U4*U3; |  | ||||||
|  |  | ||||||
|                         if(sigmaIndex<3) { |  | ||||||
|                             coalescedWrite(U_5linkA_v[x](rho), W); |  | ||||||
|                         } else { |  | ||||||
|                             coalescedWrite(U_5linkB_v[x](rho), W); |  | ||||||
|                         }     |  | ||||||
|  |  | ||||||
|                         coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_5*W); |  | ||||||
|                         sigmaIndex++; |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             }) |  | ||||||
|  |  | ||||||
|             accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link |  | ||||||
|                 stencilElement SE0, SE1, SE2, SE3, SE4, SE5; |  | ||||||
|                 U3matrix U0, U1, U2, U3, U4, U5, W; |  | ||||||
|                 int sigmaIndex = 0; |  | ||||||
|                 for(int nu=0;nu<Nd;nu++) { |  | ||||||
|                     if(nu==mu) continue; |  | ||||||
|                     int s = stencilIndex(mu,nu); |  | ||||||
|                     for(int rho=0;rho<Nd;rho++) { |  | ||||||
|                         if (rho == mu || rho == nu) continue; |  | ||||||
|  |  | ||||||
|                         SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu      = SE0->_offset; |  | ||||||
|                         SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu      = SE1->_offset; |  | ||||||
|                         SE2 = gStencil_v.GetEntry(s+2,site); int x           = SE2->_offset; |  | ||||||
|                         SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset; |  | ||||||
|                         SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu      = SE4->_offset; |  | ||||||
|  |  | ||||||
|                         U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd); |  | ||||||
|                         if(sigmaIndex<3) { |  | ||||||
|                             U1 = coalescedReadGeneralPermute(U_5linkB_v[x_p_nu](rho),SE1->_permute,Nd); |  | ||||||
|                         } else { |  | ||||||
|                             U1 = coalescedReadGeneralPermute(U_5linkA_v[x_p_nu](rho),SE1->_permute,Nd); |  | ||||||
|                         }   |  | ||||||
|                         U2 = coalescedReadGeneralPermute(U_v[x](nu),SE2->_permute,Nd); |  | ||||||
|                         U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd); |  | ||||||
|                         if(sigmaIndex<3) { |  | ||||||
|                             U4 = coalescedReadGeneralPermute(U_5linkB_v[x_m_nu](rho),SE4->_permute,Nd); |  | ||||||
|                         } else { |  | ||||||
|                             U4 = coalescedReadGeneralPermute(U_5linkA_v[x_m_nu](rho),SE4->_permute,Nd); |  | ||||||
|                         }   |  | ||||||
|                         U5 = coalescedReadGeneralPermute(U_v[x_m_nu](nu),SE4->_permute,Nd); |  | ||||||
|  |  | ||||||
|                         W  = U2*U1*adj(U0) + adj(U5)*U4*U3; |  | ||||||
|  |  | ||||||
|                         coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_7*W); |  | ||||||
|                         sigmaIndex++; |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             }) |  | ||||||
|  |  | ||||||
|         } // end mu loop |  | ||||||
|  |  | ||||||
|         // c1, c3, c5, c7 construct contributions |  | ||||||
|         u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin; |  | ||||||
|  |  | ||||||
|         // Load up U and V std::vectors to access thin and smeared links. |  | ||||||
|         std::vector<LF> U(Nd, grid); |  | ||||||
|         std::vector<LF> V(Nd, grid); |  | ||||||
|         std::vector<LF> Vnaik(Nd, grid); |  | ||||||
|         for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|             U[mu] = PeekIndex<LorentzIndex>(u_thin, mu); |  | ||||||
|             V[mu] = PeekIndex<LorentzIndex>(u_smr, mu); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         for(int mu=0;mu<Nd;mu++) { |  | ||||||
|  |  | ||||||
|             // Naik |  | ||||||
|             Vnaik[mu] = lt.c_naik*Gimpl::CovShiftForward(U[mu],mu, |  | ||||||
|                                     Gimpl::CovShiftForward(U[mu],mu, |  | ||||||
|                                       Gimpl::CovShiftIdentityForward(U[mu],mu))); |  | ||||||
|  |  | ||||||
|             // LePage |  | ||||||
|             for (int nu_h=1;nu_h<Nd;nu_h++) { |  | ||||||
|                 int nu=(mu+nu_h)%Nd; |  | ||||||
|                                 // nu, nu, mu, Back(nu), Back(nu) |  | ||||||
|                 V[mu] = V[mu] + lt.c_lp*Gimpl::CovShiftForward(U[nu],nu, |  | ||||||
|                                           Gimpl::CovShiftForward(U[nu],nu, |  | ||||||
|                                             Gimpl::CovShiftForward(U[mu],mu, |  | ||||||
|                                               Gimpl::CovShiftBackward(U[nu],nu, |  | ||||||
|                                                 Gimpl::CovShiftIdentityBackward(U[nu],nu))))) |  | ||||||
|                                 // Back(nu), Back(nu), mu, nu, nu |  | ||||||
|                               + lt.c_lp*Gimpl::CovShiftBackward(U[nu],nu, |  | ||||||
|                                           Gimpl::CovShiftBackward(U[nu],nu, |  | ||||||
|                                             Gimpl::CovShiftForward(U[mu],mu, |  | ||||||
|                                               Gimpl::CovShiftForward(U[nu],nu, |  | ||||||
|                                                 Gimpl::CovShiftIdentityForward(U[nu],nu))))); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // Put V back into u_smr. |  | ||||||
|         for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|             PokeIndex<LorentzIndex>(u_smr , V[mu]    , mu); |  | ||||||
|             PokeIndex<LorentzIndex>(u_naik, Vnaik[mu], mu); |  | ||||||
|         } |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     // Intent: OUT--u_proj |  | ||||||
|     //          IN--u_mu |  | ||||||
|     void projectU3(GF& u_proj, GF& u_mu) const { |  | ||||||
|  |  | ||||||
|         auto grid = this->_grid; |  | ||||||
|  |  | ||||||
|         LF V(grid), Q(grid), sqrtQinv(grid), id_3(grid), diff(grid); |  | ||||||
|         CF c0(grid), c1(grid), c2(grid), g0(grid), g1(grid), g2(grid), S(grid), R(grid), theta(grid),  |  | ||||||
|            u(grid), v(grid), w(grid), den(grid), f0(grid), f1(grid), f2(grid); |  | ||||||
|  |  | ||||||
|         // Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8) |  | ||||||
|         for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|             V  = PeekIndex<LorentzIndex>(u_mu, mu); |  | ||||||
|             Q  = adj(V)*V; |  | ||||||
|             c0 =        real(trace(Q)); |  | ||||||
|             c1 = (1/2.)*real(trace(Q*Q)); |  | ||||||
|             c2 = (1/3.)*real(trace(Q*Q*Q)); |  | ||||||
|             S  = (1/3.)*c1-(1/18.)*c0*c0; |  | ||||||
|             if (norm2(S)<1e-28) { |  | ||||||
|                 g0 = (1/3.)*c0; g1 = g0; g2 = g1; |  | ||||||
|             } else { |  | ||||||
|                 R     = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0; |  | ||||||
|                 theta = acos(R*pow(S,-1.5)); |  | ||||||
|                 g0    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.); |  | ||||||
|                 g1    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta          ); |  | ||||||
|                 g2    = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.); |  | ||||||
|             } |  | ||||||
| //            if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD } |  | ||||||
|             u     = sqrt(g0) + sqrt(g1) + sqrt(g2); |  | ||||||
|             v     = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2); |  | ||||||
|             w     = sqrt(g0*g1*g2); |  | ||||||
|             den   = w*(u*v-w); |  | ||||||
|             f0    = (-w*(u*u+v)+u*v*v)/den; |  | ||||||
|             f1    = (-w-u*u*u+2.*u*v)/den; |  | ||||||
|             f2    = u/den; |  | ||||||
|             id_3  = 1.; |  | ||||||
|  |  | ||||||
|             sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q; |  | ||||||
|  |  | ||||||
|             PokeIndex<LorentzIndex>(u_proj, V*sqrtQinv, mu); |  | ||||||
|         } |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| //    void derivative(const GaugeField& Gauge) const { |  | ||||||
| //    }; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); |  | ||||||
| @@ -5,5 +5,4 @@ | |||||||
| #include <Grid/qcd/smearing/StoutSmearing.h> | #include <Grid/qcd/smearing/StoutSmearing.h> | ||||||
| #include <Grid/qcd/smearing/GaugeConfiguration.h> | #include <Grid/qcd/smearing/GaugeConfiguration.h> | ||||||
| #include <Grid/qcd/smearing/WilsonFlow.h> | #include <Grid/qcd/smearing/WilsonFlow.h> | ||||||
| #include <Grid/qcd/smearing/HISQSmearing.h> |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -69,7 +69,7 @@ public: | |||||||
|   /*! Construct stout smearing object from explicitly specified rho matrix */ |   /*! Construct stout smearing object from explicitly specified rho matrix */ | ||||||
|   Smear_Stout(const std::vector<double>& rho_) |   Smear_Stout(const std::vector<double>& rho_) | ||||||
|     : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} { |     : OwnedBase{new Smear_APE<Gimpl>(rho_)}, SmearBase{OwnedBase.get()} { | ||||||
|     std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl; |     std::cout << GridLogDebug << "Stout smearing constructor : Smear_Stout(const std::vector<double>& " << rho_ << " )" << std::endl | ||||||
|     assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3"); |     assert(Nc == 3 && "Stout smearing currently implemented only for Nc==3"); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -54,7 +54,361 @@ struct LaplacianParams : Serializable { | |||||||
|       precision(precision){}; |       precision(precision){}; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | #define LEG_LOAD(Dir)						 \ | ||||||
|  |   SE = st.GetEntry(ptype, Dir, ss);				 \ | ||||||
|  |   if (SE->_is_local ) {						 \ | ||||||
|  |     int perm= SE->_permute;					 \ | ||||||
|  |     chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ | ||||||
|  |   } else {							 \ | ||||||
|  |     chi = coalescedRead(buf[SE->_offset],lane);			 \ | ||||||
|  |   }								 \ | ||||||
|  |   acceleratorSynchronise(); | ||||||
|  |  | ||||||
|  | const std::vector<int> directions4D   ({Xdir,Ydir,Zdir,Tdir,Xdir,Ydir,Zdir,Tdir}); | ||||||
|  | const std::vector<int> displacements4D({1,1,1,1,-1,-1,-1,-1}); | ||||||
|  |  | ||||||
|  | template<class Gimpl,class Field> class CovariantAdjointLaplacianStencil : public SparseMatrixBase<Field> | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |   INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|  | //  RealD kappa; | ||||||
|  |  | ||||||
|  |   typedef typename Field::vector_object siteObject; | ||||||
|  |  | ||||||
|  |   template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nc> >, Nds>; | ||||||
|  |   typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField; | ||||||
|  |   typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; | ||||||
|  |   typedef CartesianStencil<siteObject, siteObject, DefaultImplParams> StencilImpl; | ||||||
|  |  | ||||||
|  |   GridBase *grid; | ||||||
|  |   StencilImpl Stencil; | ||||||
|  |   SimpleCompressor<siteObject> Compressor; | ||||||
|  |   DoubledGaugeField Uds; | ||||||
|  |  | ||||||
|  |   CovariantAdjointLaplacianStencil( GridBase *_grid) | ||||||
|  |     : grid(_grid), | ||||||
|  |       Stencil    (grid,8,Even,directions4D,displacements4D), | ||||||
|  |       Uds(grid){} | ||||||
|  |  | ||||||
|  |   CovariantAdjointLaplacianStencil(GaugeField &Umu) | ||||||
|  |     : | ||||||
|  |       grid(Umu.Grid()), | ||||||
|  |       Stencil    (grid,8,Even,directions4D,displacements4D), | ||||||
|  |       Uds(grid) | ||||||
|  |   { GaugeImport(Umu); } | ||||||
|  |  | ||||||
|  |   void GaugeImport (const GaugeField &Umu) | ||||||
|  |   { | ||||||
|  |     assert(grid == Umu.Grid()); | ||||||
|  |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|  |       auto U = PeekIndex<LorentzIndex>(Umu, mu); | ||||||
|  |       PokeIndex<LorentzIndex>(Uds, U, mu ); | ||||||
|  |       U = adj(Cshift(U, mu, -1)); | ||||||
|  |       PokeIndex<LorentzIndex>(Uds, U, mu + 4); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |    | ||||||
|  |   virtual GridBase *Grid(void) { return grid; }; | ||||||
|  | // Disabled (#if 0) and does not compile as written: it uses _lef, def and left, none of which is declared here (the names in scope are _left, der and in). | ||||||
|  | #if 0 | ||||||
|  |   virtual void  MDeriv(const Field &_left, Field &_right,Field &_der, int mu) | ||||||
|  |   { | ||||||
|  |     // Unfinished derivative kernel: for the four legs 0..3 (Xp..Tp) it adds | ||||||
|  |     // phi*(UU*chi*adj(UU)) to res, reading mult(&a,&b,&c) as a = b*c (TODO confirm). | ||||||
|  |     // res starts from def[ss] (presumably meant to be der[ss]); the mu argument is never used. | ||||||
|  |     Stencil.HaloExchange(_lef, Compressor); | ||||||
|  |  | ||||||
|  |     // Views and site kernel. chi is never assigned by name in this block; | ||||||
|  |     // presumably the LEG_LOAD macro (defined outside this function) fills it - TODO confirm. | ||||||
|  |     // LEG_LOAD_MULT_LINK is #undef'd after the end of the class. | ||||||
|  |     autoView( st     , Stencil    , AcceleratorRead); | ||||||
|  |     auto buf = st.CommBuf(); | ||||||
|  |  | ||||||
|  |     autoView( in     , _left    , AcceleratorRead); | ||||||
|  |     autoView( right    , _right   , AcceleratorRead); | ||||||
|  |     autoView( der    , _der   , AcceleratorWrite); | ||||||
|  |     autoView( U     , Uds    , AcceleratorRead); | ||||||
|  |  | ||||||
|  |     typedef typename Field::vector_object        vobj; | ||||||
|  |     typedef decltype(coalescedRead(left[0]))    calcObj; | ||||||
|  |     typedef decltype(coalescedRead(U[0](0))) calcLink; | ||||||
|  |  | ||||||
|  |     const int      Nsimd = vobj::Nsimd(); | ||||||
|  |     const uint64_t NN = grid->oSites(); | ||||||
|  |  | ||||||
|  |     accelerator_for( ss, NN, Nsimd, { | ||||||
|  |  | ||||||
|  | 	StencilEntry *SE; | ||||||
|  | 	 | ||||||
|  | 	const int lane=acceleratorSIMTlane(Nsimd); | ||||||
|  |  | ||||||
|  | 	calcObj chi; | ||||||
|  | 	calcObj phi; | ||||||
|  | 	calcObj res; | ||||||
|  | 	calcObj Uchi; | ||||||
|  | 	calcObj Utmp; | ||||||
|  | 	calcObj Utmp2; | ||||||
|  | 	calcLink UU; | ||||||
|  | 	calcLink Udag; | ||||||
|  | 	int ptype; | ||||||
|  |  | ||||||
|  | 	res                 = coalescedRead(def[ss]); | ||||||
|  | 	phi                 = coalescedRead(right[ss]); | ||||||
|  |  | ||||||
|  | #define LEG_LOAD_MULT_LINK(leg,polarisation)			\ | ||||||
|  | 	UU = coalescedRead(U[ss](polarisation));	\ | ||||||
|  | 	Udag = adj(UU);					\ | ||||||
|  | 	LEG_LOAD(leg);					\ | ||||||
|  | 	mult(&Utmp(), &UU, &chi());			\ | ||||||
|  | 	Utmp2 = adj(Utmp);				\ | ||||||
|  | 	mult(&Utmp(), &UU, &Utmp2());			\ | ||||||
|  | 	Utmp2 = adj(Utmp);				\ | ||||||
|  | 	mult(&Uchi(), &phi(), &Utmp2());			\ | ||||||
|  | 	res = res + Uchi; | ||||||
|  | 	 | ||||||
|  | 	LEG_LOAD_MULT_LINK(0,Xp); | ||||||
|  | 	LEG_LOAD_MULT_LINK(1,Yp); | ||||||
|  | 	LEG_LOAD_MULT_LINK(2,Zp); | ||||||
|  | 	LEG_LOAD_MULT_LINK(3,Tp); | ||||||
|  |  | ||||||
|  | 	coalescedWrite(der[ss], res,lane); | ||||||
|  |     }); | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   virtual void  Morig(const Field &_in, Field &_out) | ||||||
|  |   { | ||||||
|  |     // Reference path (Mnew is the split local/non-local variant): out = -8*in + the sum over | ||||||
|  |     // the eight legs of UU*chi*adj(UU), reading mult(&a,&b,&c) as a = b*c (TODO confirm). | ||||||
|  |     // LEG_LOAD_MULT, #defined below, is also used by Mnew. The halo exchange comes first: | ||||||
|  |     Stencil.HaloExchange(_in, Compressor); | ||||||
|  |  | ||||||
|  |     // Views for the kernel: stencil, input and links are opened AcceleratorRead, | ||||||
|  |     // the output AcceleratorWrite. buf is not used by name below; presumably the | ||||||
|  |     // LEG_LOAD macro (defined outside this function) refers to it - TODO confirm. | ||||||
|  | //    auto st = Stencil.View(AcceleratorRead); | ||||||
|  |     autoView( st     , Stencil    , AcceleratorRead); | ||||||
|  |     auto buf = st.CommBuf(); | ||||||
|  |  | ||||||
|  |     autoView( in     , _in    , AcceleratorRead); | ||||||
|  |     autoView( out    , _out   , AcceleratorWrite); | ||||||
|  |     autoView( U     , Uds    , AcceleratorRead); | ||||||
|  |  | ||||||
|  |     typedef typename Field::vector_object        vobj; | ||||||
|  |     typedef decltype(coalescedRead(in[0]))    calcObj; | ||||||
|  |     typedef decltype(coalescedRead(U[0](0))) calcLink; | ||||||
|  |  | ||||||
|  |     const int      Nsimd = vobj::Nsimd(); | ||||||
|  |     const uint64_t NN = grid->oSites(); | ||||||
|  |  | ||||||
|  |     accelerator_for( ss, NN, Nsimd, { | ||||||
|  |  | ||||||
|  | 	StencilEntry *SE; | ||||||
|  | 	 | ||||||
|  | 	const int lane=acceleratorSIMTlane(Nsimd); | ||||||
|  |  | ||||||
|  | 	calcObj chi; | ||||||
|  | 	calcObj res; | ||||||
|  | 	calcObj Uchi; | ||||||
|  | 	calcObj Utmp; | ||||||
|  | 	calcObj Utmp2; | ||||||
|  | 	calcLink UU; | ||||||
|  | 	calcLink Udag; | ||||||
|  | 	int ptype; | ||||||
|  |  | ||||||
|  | 	res                 = coalescedRead(in[ss])*(-8.0); | ||||||
|  |  | ||||||
|  | #define LEG_LOAD_MULT(leg,polarisation)			\ | ||||||
|  | 	UU = coalescedRead(U[ss](polarisation));	\ | ||||||
|  | 	Udag = adj(UU);					\ | ||||||
|  | 	LEG_LOAD(leg);					\ | ||||||
|  | 	mult(&Utmp(), &UU, &chi());			\ | ||||||
|  | 	Utmp2 = adj(Utmp);				\ | ||||||
|  | 	mult(&Utmp(), &UU, &Utmp2());			\ | ||||||
|  | 	Uchi = adj(Utmp);				\ | ||||||
|  | 	res = res + Uchi; | ||||||
|  | 	 | ||||||
|  | 	LEG_LOAD_MULT(0,Xp); | ||||||
|  | 	LEG_LOAD_MULT(1,Yp); | ||||||
|  | 	LEG_LOAD_MULT(2,Zp); | ||||||
|  | 	LEG_LOAD_MULT(3,Tp); | ||||||
|  | 	LEG_LOAD_MULT(4,Xm); | ||||||
|  | 	LEG_LOAD_MULT(5,Ym); | ||||||
|  | 	LEG_LOAD_MULT(6,Zm); | ||||||
|  | 	LEG_LOAD_MULT(7,Tm); | ||||||
|  |  | ||||||
|  | 	coalescedWrite(out[ss], res,lane); | ||||||
|  |     }); | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |   // Same operator as Morig, with the halo communication split around the compute: | ||||||
|  |   //   1. gather the halos and begin communication, | ||||||
|  |   //   2. kernel 1: out = -8*in + every leg whose stencil entry has _is_local set, | ||||||
|  |   //   3. complete communication and merge the received buffers, | ||||||
|  |   //   4. kernel 2: re-read out and add every leg whose entry has _is_local clear. | ||||||
|  |   // | ||||||
|  |   // _in  : field the operator is applied to (read only). | ||||||
|  |   // _out : result field; written by kernel 1, then read and rewritten by kernel 2. | ||||||
|  |   // | ||||||
|  |   // The kernels rely on the LEG_LOAD_MULT macro #defined inside Morig, which in turn | ||||||
|  |   // invokes LEG_LOAD (defined outside this function). LEG_LOAD_MULT refers to the | ||||||
|  |   // locals U, ss, UU, Udag, Utmp, Utmp2, chi, Uchi and res by name; LEG_LOAD | ||||||
|  |   // presumably uses SE, ptype, st, in, buf and lane as well (TODO confirm), so | ||||||
|  |   // none of the local names below may be changed. | ||||||
|  |   virtual void  Mnew (const Field &_in, Field &_out) | ||||||
|  |   { | ||||||
|  |     /////////////////////////////////////////////// | ||||||
|  |     // Halo gather and start of communication | ||||||
|  |     /////////////////////////////////////////////// | ||||||
|  |     std::vector<std::vector<CommsRequest_t> > requests; | ||||||
|  |     Stencil.Prepare(); | ||||||
|  |     { | ||||||
|  |       GRID_TRACE("Laplace Gather"); | ||||||
|  |       Stencil.HaloGather(_in,Compressor); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     tracePush("Laplace Communication"); | ||||||
|  |     Stencil.CommunicateBegin(requests); | ||||||
|  |     { | ||||||
|  |       GRID_TRACE("MergeSHM"); | ||||||
|  |       Stencil.CommsMergeSHM(Compressor); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /////////////////////////////////// | ||||||
|  |     // Views shared by both kernels | ||||||
|  |     /////////////////////////////////// | ||||||
|  |     autoView( st     , Stencil    , AcceleratorRead); | ||||||
|  |     auto buf = st.CommBuf(); | ||||||
|  |  | ||||||
|  |     autoView( in     , _in    , AcceleratorRead); | ||||||
|  |     autoView( out    , _out   , AcceleratorWrite); | ||||||
|  |     autoView( U     , Uds    , AcceleratorRead); | ||||||
|  |  | ||||||
|  |     typedef typename Field::vector_object        vobj; | ||||||
|  |     typedef decltype(coalescedRead(in[0]))    calcObj; | ||||||
|  |     typedef decltype(coalescedRead(U[0](0))) calcLink; | ||||||
|  |  | ||||||
|  |     const int      Nsimd = vobj::Nsimd(); | ||||||
|  |     const uint64_t NN = grid->oSites(); | ||||||
|  |  | ||||||
|  |     /////////////////////////////////// | ||||||
|  |     // Kernel 1: diagonal term plus the legs flagged _is_local | ||||||
|  |     /////////////////////////////////// | ||||||
|  |     accelerator_for( ss, NN, Nsimd, { | ||||||
|  |  | ||||||
|  |       StencilEntry *SE; | ||||||
|  |  | ||||||
|  |       const int lane=acceleratorSIMTlane(Nsimd); | ||||||
|  |  | ||||||
|  |       calcObj chi; | ||||||
|  |       calcObj res; | ||||||
|  |       calcObj Uchi; | ||||||
|  |       calcObj Utmp; | ||||||
|  |       calcObj Utmp2; | ||||||
|  |       calcLink UU; | ||||||
|  |       calcLink Udag; | ||||||
|  |       int ptype; | ||||||
|  |  | ||||||
|  |       res                 = coalescedRead(in[ss])*(-8.0); | ||||||
|  |  | ||||||
|  |       SE = st.GetEntry(ptype, 0, ss); | ||||||
|  |       if (SE->_is_local ) { | ||||||
|  |         LEG_LOAD_MULT(0,Xp); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 1, ss); | ||||||
|  |       if (SE->_is_local ) { | ||||||
|  |         LEG_LOAD_MULT(1,Yp); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 2, ss); | ||||||
|  |       if (SE->_is_local ) { | ||||||
|  |         LEG_LOAD_MULT(2,Zp); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 3, ss); | ||||||
|  |       if (SE->_is_local ) { | ||||||
|  |         LEG_LOAD_MULT(3,Tp); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 4, ss); | ||||||
|  |       if (SE->_is_local ) { | ||||||
|  |         LEG_LOAD_MULT(4,Xm); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 5, ss); | ||||||
|  |       if (SE->_is_local ) { | ||||||
|  |         LEG_LOAD_MULT(5,Ym); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 6, ss); | ||||||
|  |       if (SE->_is_local ) { | ||||||
|  |         LEG_LOAD_MULT(6,Zm); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 7, ss); | ||||||
|  |       if (SE->_is_local ) { | ||||||
|  |         LEG_LOAD_MULT(7,Tm); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       coalescedWrite(out[ss], res,lane); | ||||||
|  |     }); | ||||||
|  |  | ||||||
|  |     /////////////////////////////////// | ||||||
|  |     // Finish communication. The pop label now matches the push label above | ||||||
|  |     // (it was "Communication"), so the traced range is opened and closed | ||||||
|  |     // under the same name. | ||||||
|  |     /////////////////////////////////// | ||||||
|  |     Stencil.CommunicateComplete(requests); | ||||||
|  |     tracePop("Laplace Communication"); | ||||||
|  |  | ||||||
|  |     { | ||||||
|  |       GRID_TRACE("Merge"); | ||||||
|  |       Stencil.CommsMerge(Compressor); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /////////////////////////////////// | ||||||
|  |     // Kernel 2: start from the partial result in out and add the legs | ||||||
|  |     // whose stencil entry is not flagged _is_local | ||||||
|  |     /////////////////////////////////// | ||||||
|  |     accelerator_for( ss, NN, Nsimd, { | ||||||
|  |  | ||||||
|  |       StencilEntry *SE; | ||||||
|  |  | ||||||
|  |       const int lane=acceleratorSIMTlane(Nsimd); | ||||||
|  |  | ||||||
|  |       calcObj chi; | ||||||
|  |       calcObj res; | ||||||
|  |       calcObj Uchi; | ||||||
|  |       calcObj Utmp; | ||||||
|  |       calcObj Utmp2; | ||||||
|  |       calcLink UU; | ||||||
|  |       calcLink Udag; | ||||||
|  |       int ptype; | ||||||
|  |  | ||||||
|  |       res                 = coalescedRead(out[ss]); | ||||||
|  |  | ||||||
|  |       SE = st.GetEntry(ptype, 0, ss); | ||||||
|  |       if ((SE->_is_local )==0){ | ||||||
|  |         LEG_LOAD_MULT(0,Xp); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 1, ss); | ||||||
|  |       if ((SE->_is_local )==0){ | ||||||
|  |         LEG_LOAD_MULT(1,Yp); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 2, ss); | ||||||
|  |       if ((SE->_is_local )==0){ | ||||||
|  |         LEG_LOAD_MULT(2,Zp); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 3, ss); | ||||||
|  |       if ((SE->_is_local )==0){ | ||||||
|  |         LEG_LOAD_MULT(3,Tp); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 4, ss); | ||||||
|  |       if ((SE->_is_local )==0){ | ||||||
|  |         LEG_LOAD_MULT(4,Xm); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 5, ss); | ||||||
|  |       if ((SE->_is_local )==0){ | ||||||
|  |         LEG_LOAD_MULT(5,Ym); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 6, ss); | ||||||
|  |       if ((SE->_is_local )==0){ | ||||||
|  |         LEG_LOAD_MULT(6,Zm); | ||||||
|  |       } | ||||||
|  |       SE = st.GetEntry(ptype, 7, ss); | ||||||
|  |       if ((SE->_is_local )==0){ | ||||||
|  |         LEG_LOAD_MULT(7,Tm); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       coalescedWrite(out[ss], res,lane); | ||||||
|  |     }); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Operator entry point: forwards to the split-communication implementation Mnew. | ||||||
|  |   virtual void  M(const Field &in, Field &out) | ||||||
|  |   { | ||||||
|  |     Mnew(in,out); | ||||||
|  |   } | ||||||
|  |   // Laplacian is hermitian, so the adjoint application reuses M. | ||||||
|  |   virtual void  Mdag (const Field &in, Field &out) | ||||||
|  |   { | ||||||
|  |     M(in,out); | ||||||
|  |   } | ||||||
|  |   // The three entry points below are unimplemented (needed only for multigrid); | ||||||
|  |   // each one asserts if called. | ||||||
|  |   virtual  void Mdiag    (const Field &in, Field &out) | ||||||
|  |   { | ||||||
|  |     assert(0); | ||||||
|  |   } | ||||||
|  |   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp) | ||||||
|  |   { | ||||||
|  |     assert(0); | ||||||
|  |   } | ||||||
|  |   virtual  void MdirAll  (const Field &in, std::vector<Field> &out) | ||||||
|  |   { | ||||||
|  |     assert(0); | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | #undef LEG_LOAD_MULT | ||||||
|  | #undef LEG_LOAD_MULT_LINK | ||||||
|  | #undef LEG_LOAD | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////// | ||||||
| // Laplacian operator L on adjoint fields | // Laplacian operator L on adjoint fields | ||||||
| @@ -76,29 +430,40 @@ class LaplacianAdjointField: public Metric<typename Impl::Field> { | |||||||
|   LaplacianParams param; |   LaplacianParams param; | ||||||
|   MultiShiftFunction PowerHalf;     |   MultiShiftFunction PowerHalf;     | ||||||
|   MultiShiftFunction PowerInvHalf;     |   MultiShiftFunction PowerInvHalf;     | ||||||
|  | //template<class Gimpl,class Field> class CovariantAdjointLaplacianStencil : public SparseMatrixBase<Field> | ||||||
|  |   CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField> LapStencil; | ||||||
|  |  | ||||||
| public: | public: | ||||||
|   INHERIT_GIMPL_TYPES(Impl); |   INHERIT_GIMPL_TYPES(Impl); | ||||||
|  |  | ||||||
|   LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0) |   LaplacianAdjointField(GridBase* grid, OperatorFunction<GaugeField>& S, LaplacianParams& p, const RealD k = 1.0, bool if_remez=true) | ||||||
|     : U(Nd, grid), Solver(S), param(p), kappa(k){ |     : U(Nd, grid), Solver(S), param(p), kappa(k) | ||||||
|  | 	,LapStencil(grid){ | ||||||
|     AlgRemez remez(param.lo,param.hi,param.precision); |     AlgRemez remez(param.lo,param.hi,param.precision); | ||||||
|     std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl; |     std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl; | ||||||
|  |     if(if_remez){ | ||||||
|     remez.generateApprox(param.degree,1,2); |     remez.generateApprox(param.degree,1,2); | ||||||
|     PowerHalf.Init(remez,param.tolerance,false); |     PowerHalf.Init(remez,param.tolerance,false); | ||||||
|     PowerInvHalf.Init(remez,param.tolerance,true); |     PowerInvHalf.Init(remez,param.tolerance,true); | ||||||
|  |     } | ||||||
|  |     this->triv=0; | ||||||
|          |          | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |   LaplacianAdjointField(){this->triv=0; printf("triv=%d\n",this->Trivial());} | ||||||
|   void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);} |   void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);} | ||||||
|   void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);} |   void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);} | ||||||
|   void Mdiag(const GaugeField&, GaugeField&){ assert(0);} |   void Mdiag(const GaugeField&, GaugeField&){ assert(0);} | ||||||
|  |  | ||||||
|   void ImportGauge(const GaugeField& _U) { |   void ImportGauge(const GaugeField& _U) { | ||||||
|  |     RealD total=0.; | ||||||
|     for (int mu = 0; mu < Nd; mu++) { |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|       U[mu] = PeekIndex<LorentzIndex>(_U, mu); |       U[mu] = PeekIndex<LorentzIndex>(_U, mu); | ||||||
|  |       total += norm2(U[mu]); | ||||||
|     } |     } | ||||||
|  |     LapStencil.GaugeImport (_U); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogDebug <<"ImportGauge:norm2(U _U) = "<<total<<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void M(const GaugeField& in, GaugeField& out) { |   void M(const GaugeField& in, GaugeField& out) { | ||||||
| @@ -106,10 +471,12 @@ public: | |||||||
|     // test |     // test | ||||||
|     //GaugeField herm = in + adj(in); |     //GaugeField herm = in + adj(in); | ||||||
|     //std::cout << "AHermiticity: " << norm2(herm) << std::endl; |     //std::cout << "AHermiticity: " << norm2(herm) << std::endl; | ||||||
|  | //    std::cout << GridLogDebug <<"M:Kappa = "<<kappa<<std::endl; | ||||||
|  |  | ||||||
|  |     GaugeLinkField sum(in.Grid()); | ||||||
|  | #if 0 | ||||||
|     GaugeLinkField tmp(in.Grid()); |     GaugeLinkField tmp(in.Grid()); | ||||||
|     GaugeLinkField tmp2(in.Grid()); |     GaugeLinkField tmp2(in.Grid()); | ||||||
|     GaugeLinkField sum(in.Grid()); |  | ||||||
|  |  | ||||||
|     for (int nu = 0; nu < Nd; nu++) { |     for (int nu = 0; nu < Nd; nu++) { | ||||||
|       sum = Zero(); |       sum = Zero(); | ||||||
| @@ -123,10 +490,22 @@ public: | |||||||
|       out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum; |       out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum; | ||||||
|       PokeIndex<LorentzIndex>(out, out_nu, nu); |       PokeIndex<LorentzIndex>(out, out_nu, nu); | ||||||
|     } |     } | ||||||
|  | #else | ||||||
|  |     for (int nu = 0; nu < Nd; nu++) { | ||||||
|  |       GaugeLinkField in_nu = PeekIndex<LorentzIndex>(in, nu); | ||||||
|  |       GaugeLinkField out_nu(out.Grid()); | ||||||
|  |       LapStencil.M(in_nu,sum); | ||||||
|  |       out_nu = (1.0 - kappa) * in_nu - kappa / (double(4 * Nd)) * sum; | ||||||
|  |       PokeIndex<LorentzIndex>(out, out_nu, nu); | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  | //    std::cout << GridLogDebug <<"M:norm2(out) = "<<norm2(out)<<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   void MDeriv(const GaugeField& in, GaugeField& der) { |   void MDeriv(const GaugeField& in, GaugeField& der) { | ||||||
|     // in is anti-hermitian |     // in is anti-hermitian | ||||||
|  | //    std::cout << GridLogDebug <<"MDeriv:Kappa = "<<kappa<<std::endl; | ||||||
|     RealD factor = -kappa / (double(4 * Nd)); |     RealD factor = -kappa / (double(4 * Nd)); | ||||||
|      |      | ||||||
|     for (int mu = 0; mu < Nd; mu++){ |     for (int mu = 0; mu < Nd; mu++){ | ||||||
| @@ -140,6 +519,7 @@ public: | |||||||
|       // adjoint in the last multiplication |       // adjoint in the last multiplication | ||||||
|       PokeIndex<LorentzIndex>(der,  -2.0 * factor * der_mu, mu); |       PokeIndex<LorentzIndex>(der,  -2.0 * factor * der_mu, mu); | ||||||
|     }  |     }  | ||||||
|  |     std::cout << GridLogDebug <<"MDeriv: Kappa= "<< kappa << " norm2(der) = "<<norm2(der)<<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // separating this temporarily |   // separating this temporarily | ||||||
| @@ -159,11 +539,22 @@ public: | |||||||
|       } |       } | ||||||
|       PokeIndex<LorentzIndex>(der, -factor * der_mu, mu); |       PokeIndex<LorentzIndex>(der, -factor * der_mu, mu); | ||||||
|     } |     } | ||||||
|  |     std::cout << GridLogDebug <<"MDeriv: Kappa= "<< kappa << " norm2(der) = "<<norm2(der)<<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void Minv(const GaugeField& in, GaugeField& inverted){ |   void Minv(const GaugeField& in, GaugeField& inverted){ | ||||||
|     HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this); |     HermitianLinearOperator<LaplacianAdjointField<Impl>,GaugeField> HermOp(*this); | ||||||
|     Solver(HermOp, in, inverted); |     Solver(HermOp, in, inverted); | ||||||
|  |     std::cout << GridLogDebug <<"Minv:norm2(inverted) = "<<norm2(inverted)<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   void MinvDeriv(const GaugeField& in, GaugeField& der) { | ||||||
|  |     GaugeField X(in.Grid()); | ||||||
|  |     Minv(in,X); | ||||||
|  |     MDeriv(X,der); | ||||||
|  |     der *=-1.0; | ||||||
|  |     std::cout << GridLogDebug <<"MinvDeriv:norm2(der) = "<<norm2(der)<<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void MSquareRoot(GaugeField& P){ |   void MSquareRoot(GaugeField& P){ | ||||||
| @@ -172,6 +563,7 @@ public: | |||||||
|     ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerHalf); |     ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerHalf); | ||||||
|     msCG(HermOp,P,Gp); |     msCG(HermOp,P,Gp); | ||||||
|     P = Gp;  |     P = Gp;  | ||||||
|  |     std::cout << GridLogDebug <<"MSquareRoot:norm2(P) = "<<norm2(P)<<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void MInvSquareRoot(GaugeField& P){ |   void MInvSquareRoot(GaugeField& P){ | ||||||
| @@ -180,6 +572,7 @@ public: | |||||||
|     ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerInvHalf); |     ConjugateGradientMultiShift<GaugeField> msCG(param.MaxIter,PowerInvHalf); | ||||||
|     msCG(HermOp,P,Gp); |     msCG(HermOp,P,Gp); | ||||||
|     P = Gp;  |     P = Gp;  | ||||||
|  |     std::cout << GridLogDebug <<"MInvSquareRoot:norm2(P) = "<<norm2(P)<<std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										403
									
								
								Grid/qcd/utils/CovariantLaplacianRat.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										403
									
								
								Grid/qcd/utils/CovariantLaplacianRat.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,403 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
|  | Source file: ./Grid/qcd/utils/CovariantLaplacianRat.h | ||||||
|  |  | ||||||
|  | Copyright (C) 2021 | ||||||
|  |  | ||||||
|  | Author: Chulwoo Jung <chulwoo@bnl.gov> | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution | ||||||
|  | directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | 			   /*  END LEGAL */ | ||||||
|  | #pragma once  | ||||||
|  | #define MIXED_CG | ||||||
|  | //enable/disable push_back | ||||||
|  | #undef USE_CHRONO  | ||||||
|  |  | ||||||
|  | //#include <roctracer/roctx.h> | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  | // Parameter bundle passed to LaplacianAdjointRat. | ||||||
|  | // The coefficient vectors a0, a1, b0 and b1 each hold `order` entries and start | ||||||
|  | // at zero; offset and b2 start at 1. Callers fill in the coefficients after | ||||||
|  | // construction. | ||||||
|  | struct LaplacianRatParams { | ||||||
|  |  | ||||||
|  |   RealD offset;            // constant term, defaults to 1 | ||||||
|  |   int order;               // number of entries in each coefficient vector | ||||||
|  |   std::vector<RealD> a0; | ||||||
|  |   std::vector<RealD> a1; | ||||||
|  |   std::vector<RealD> b0; | ||||||
|  |   std::vector<RealD> b1; | ||||||
|  |   RealD b2; //for debugging | ||||||
|  |   int   MaxIter; | ||||||
|  |   RealD tolerance; | ||||||
|  |   int   precision; | ||||||
|  |  | ||||||
|  |   // Every argument is optional; the coefficient vectors are sized to `ord` | ||||||
|  |   // and zero-filled directly in the initialiser list. | ||||||
|  |   LaplacianRatParams(int ord = 1, | ||||||
|  |                   int maxit     = 1000, | ||||||
|  |                   RealD tol     = 1.0e-8, | ||||||
|  |                   int precision = 64) | ||||||
|  |     : offset(1.), | ||||||
|  |       order(ord), | ||||||
|  |       a0(ord,0.), | ||||||
|  |       a1(ord,0.), | ||||||
|  |       b0(ord,0.), | ||||||
|  |       b1(ord,0.), | ||||||
|  |       b2(1.), | ||||||
|  |       MaxIter(maxit), | ||||||
|  |       tolerance(tol), | ||||||
|  |       precision(precision) | ||||||
|  |   {} | ||||||
|  | }; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | //////////////////////////////////////////////////////////// | ||||||
|  | // Laplacian operator L on adjoint fields | ||||||
|  | // | ||||||
|  | // phi: adjoint field | ||||||
|  | // L: D_mu^dag D_mu | ||||||
|  | // | ||||||
|  | // L phi(x) = Sum_mu [ U_mu(x)phi(x+mu)U_mu(x)^dag +  | ||||||
|  | //                     U_mu(x-mu)^dag phi(x-mu)U_mu(x-mu) | ||||||
|  | //                     -2phi(x)] | ||||||
|  | // | ||||||
|  | // Operator designed to be wrapped in | ||||||
|  | // a HermitianLinearOperator<.. , ..> | ||||||
|  | //////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  | template <class Impl, class ImplF> | ||||||
|  | class LaplacianAdjointRat: public Metric<typename Impl::Field> { | ||||||
|  |   OperatorFunction<typename Impl::Field> &Solver; | ||||||
|  |   LaplacianRatParams Gparam; | ||||||
|  |   LaplacianRatParams Mparam; | ||||||
|  |   GridBase *grid; | ||||||
|  |   GridBase *grid_f; | ||||||
|  |   CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField> LapStencil; | ||||||
|  |   CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField> LapStencilF; | ||||||
|  | public: | ||||||
|  |   INHERIT_GIMPL_TYPES(Impl); | ||||||
|  | //   typedef typename GImpl::LinkField GaugeLinkField; \ | ||||||
|  | //  typedef typename GImpl::Field GaugeField;          | ||||||
|  |   typedef typename ImplF::Field GaugeFieldF; | ||||||
|  |   typedef typename ImplF::LinkField GaugeLinkFieldF; \ | ||||||
|  |   GaugeField Usav; | ||||||
|  |   GaugeFieldF UsavF; | ||||||
|  |   std::vector< std::vector<GaugeLinkField> > prev_solnsM; | ||||||
|  |   std::vector< std::vector<GaugeLinkField> > prev_solnsMinv; | ||||||
|  |   std::vector< std::vector<GaugeLinkField> > prev_solnsMDeriv; | ||||||
|  |   std::vector< std::vector<GaugeLinkField> > prev_solnsMinvDeriv; | ||||||
|  |  | ||||||
|  | 	  LaplacianAdjointRat(GridBase* _grid, GridBase* _grid_f, OperatorFunction<GaugeField>& S, LaplacianRatParams& gpar, LaplacianRatParams& mpar) | ||||||
|  |     : grid(_grid),grid_f(_grid_f), LapStencil(_grid), LapStencilF(_grid_f), U(Nd, _grid), Solver(S), Gparam(gpar), Mparam(mpar),Usav(_grid), UsavF(_grid_f), | ||||||
|  |       prev_solnsM(4),prev_solnsMinv(4),prev_solnsMDeriv(4),prev_solnsMinvDeriv(4) { | ||||||
|  | //    std::cout<<GridLogMessage << "Generating degree "<<param.degree<<" for x^(1/2)"<<std::endl; | ||||||
|  |     this->triv=0; | ||||||
|  |          | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |   LaplacianAdjointRat(){this->triv=0; printf("triv=%d\n",this->Trivial());} | ||||||
|  |   void Mdir(const GaugeField&, GaugeField&, int, int){ assert(0);} | ||||||
|  |   void MdirAll(const GaugeField&, std::vector<GaugeField> &){ assert(0);} | ||||||
|  |   void Mdiag(const GaugeField&, GaugeField&){ assert(0);} | ||||||
|  |  | ||||||
|  |   void ImportGauge(const GaugeField& _U) { | ||||||
|  |     RealD total=0.; | ||||||
|  |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|  |       U[mu] = PeekIndex<LorentzIndex>(_U, mu); | ||||||
|  |       total += norm2(U[mu]); | ||||||
|  |     } | ||||||
|  |     Usav = _U; | ||||||
|  |     precisionChange(UsavF,Usav); | ||||||
|  |     std::cout <<GridLogDebug << "ImportGauge:norm2(_U) = "<<" "<<total<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void MDerivLink(const GaugeLinkField& left, const GaugeLinkField& right, | ||||||
|  |               GaugeField& der) { | ||||||
|  |     std::cout<<GridLogMessage << "MDerivLink start "<< std::endl; | ||||||
|  |     RealD factor = -1. / (double(4 * Nd)); | ||||||
|  |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|  |       GaugeLinkField der_mu(der.Grid()); | ||||||
|  |       der_mu = Zero(); | ||||||
|  | //      for (int nu = 0; nu < Nd; nu++) { | ||||||
|  | //        GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu); | ||||||
|  | //        GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu); | ||||||
|  |         der_mu += U[mu] * Cshift(left, mu, 1) * adj(U[mu]) * right; | ||||||
|  |         der_mu += U[mu] * Cshift(right, mu, 1) * adj(U[mu]) * left; | ||||||
|  | //      } | ||||||
|  |       PokeIndex<LorentzIndex>(der, -factor * der_mu, mu); | ||||||
|  |     } | ||||||
|  | //    std::cout << GridLogDebug <<"MDerivLink:  norm2(der) = "<<norm2(der)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "MDerivLink end "<< std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void MDerivLink(const GaugeLinkField& left, const GaugeLinkField& right, | ||||||
|  |               std::vector<GaugeLinkField> & der) { | ||||||
|  | //    std::cout<<GridLogMessage << "MDerivLink "<< std::endl; | ||||||
|  |     RealD factor = -1. / (double(4 * Nd)); | ||||||
|  |  | ||||||
|  |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|  |       GaugeLinkField der_mu(left.Grid()); | ||||||
|  |       der_mu = Zero(); | ||||||
|  |         der_mu += U[mu] * Cshift(left, mu, 1) * adj(U[mu]) * right; | ||||||
|  |         der_mu += U[mu] * Cshift(right, mu, 1) * adj(U[mu]) * left; | ||||||
|  | //      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu); | ||||||
|  |       der[mu] = -factor*der_mu; | ||||||
|  | //      std::cout << GridLogDebug <<"MDerivLink:  norm2(der) = "<<norm2(der[mu])<<std::endl; | ||||||
|  |          | ||||||
|  |     } | ||||||
|  | //    std::cout<<GridLogMessage << "MDerivLink end "<< std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   void MDerivInt(LaplacianRatParams &par, const GaugeField& left, const GaugeField& right, | ||||||
|  |               GaugeField& der ,  std::vector< std::vector<GaugeLinkField> >& prev_solns ) { | ||||||
|  |  | ||||||
|  | // get rid of this please | ||||||
|  |     std::cout<<GridLogMessage << "LaplaceStart " <<std::endl; | ||||||
|  |     RealD fac =  - 1. / (double(4 * Nd)) ; | ||||||
|  |     RealD coef=0.5; | ||||||
|  |     LapStencil.GaugeImport(Usav); | ||||||
|  |     LapStencilF.GaugeImport(UsavF); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     for (int nu=0;nu<Nd;nu++){ | ||||||
|  |         GaugeLinkField right_nu = PeekIndex<LorentzIndex>(right, nu); | ||||||
|  |         GaugeLinkField left_nu = PeekIndex<LorentzIndex>(left, nu); | ||||||
|  |         GaugeLinkField LMinvMom(left.Grid()); | ||||||
|  |      | ||||||
|  |         GaugeLinkField GMom(left.Grid()); | ||||||
|  |         GaugeLinkField LMinvGMom(left.Grid()); | ||||||
|  |      | ||||||
|  |         GaugeLinkField AGMom(left.Grid()); | ||||||
|  |         GaugeLinkField MinvAGMom(left.Grid()); | ||||||
|  |         GaugeLinkField LMinvAGMom(left.Grid()); | ||||||
|  |      | ||||||
|  |         GaugeLinkField AMinvMom(left.Grid()); | ||||||
|  |         GaugeLinkField LMinvAMom(left.Grid()); | ||||||
|  |         GaugeLinkField temp(left.Grid()); | ||||||
|  |         GaugeLinkField temp2(left.Grid()); | ||||||
|  |      | ||||||
|  |         std::vector<GaugeLinkField> MinvMom(par.order,left.Grid()); | ||||||
|  |      | ||||||
|  |         GaugeLinkField MinvGMom(left.Grid()); | ||||||
|  |         GaugeLinkField Gtemp(left.Grid()); | ||||||
|  |         GaugeLinkField Gtemp2(left.Grid()); | ||||||
|  |      | ||||||
|  |      | ||||||
|  |         ConjugateGradient<GaugeLinkField> CG(par.tolerance,10000,false); | ||||||
|  |     //    ConjugateGradient<GaugeFieldF> CG_f(par.tolerance,10000,false); | ||||||
|  |         LaplacianParams LapPar(0.0001, 1.0, 10000, 1e-8, 12, 64); | ||||||
|  |      | ||||||
|  |         ChronoForecast< QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,GaugeLinkField>,GaugeLinkField> , GaugeLinkField> Forecast; | ||||||
|  |      | ||||||
|  |         GMom = par.offset * right_nu; | ||||||
|  |      | ||||||
|  |         for(int i =0;i<par.order;i++){ | ||||||
|  |         QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2); | ||||||
|  | #if USE_CHRONO | ||||||
|  |         MinvMom[i] = Forecast(QuadOp, right_nu, prev_solns[nu]); | ||||||
|  | #endif | ||||||
|  | #ifndef MIXED_CG | ||||||
|  |         CG(QuadOp,right_nu,MinvMom[i]); | ||||||
|  | #else | ||||||
|  |         QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2); | ||||||
|  |     //    QuadLinearOperator<LaplacianAdjointField<ImplF>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],par.b1[i],par.b2); | ||||||
|  |         MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp); | ||||||
|  |         MixedCG.InnerTolerance=par.tolerance; | ||||||
|  |         MixedCG(right_nu,MinvMom[i]); | ||||||
|  |     #endif | ||||||
|  |     #if USE_CHRONO | ||||||
|  |         prev_solns[nu].push_back(MinvMom[i]); | ||||||
|  |     #endif | ||||||
|  |          | ||||||
|  |         GMom += par.a0[i]*MinvMom[i];  | ||||||
|  |         LapStencil.M(MinvMom[i],Gtemp2); | ||||||
|  |         GMom += par.a1[i]*fac*Gtemp2;  | ||||||
|  |         } | ||||||
|  |         for(int i =0;i<par.order;i++){ | ||||||
|  |         QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOp(LapStencil,par.b0[i],fac*par.b1[i],fac*fac*par.b2); | ||||||
|  |      | ||||||
|  |         MinvGMom = Forecast(QuadOp, GMom, prev_solns[nu]); | ||||||
|  |     #ifndef MIXED_CG | ||||||
|  |         CG(QuadOp,GMom,MinvGMom); | ||||||
|  |         LapStencil.M(MinvGMom, Gtemp2); LMinvGMom=fac*Gtemp2; | ||||||
|  |         CG(QuadOp,right_nu,MinvMom[i]); | ||||||
|  |     #else | ||||||
|  |         QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],fac*par.b1[i],fac*fac*par.b2); | ||||||
|  |     //    QuadLinearOperator<LaplacianAdjointField<ImplF>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[i],par.b1[i],par.b2); | ||||||
|  |         MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp); | ||||||
|  |         MixedCG.InnerTolerance=par.tolerance; | ||||||
|  |         MixedCG(GMom,MinvGMom); | ||||||
|  |         LapStencil.M(MinvGMom, Gtemp2); LMinvGMom=fac*Gtemp2; | ||||||
|  |     //    Laplacian.M(MinvGMom, LMinvGMom); | ||||||
|  |         MixedCG(right_nu,MinvMom[i]); | ||||||
|  |     #endif | ||||||
|  | #if USE_CHRONO | ||||||
|  |         prev_solns[nu].push_back(MinvGMom); | ||||||
|  | #endif | ||||||
|  |      | ||||||
|  |         LapStencil.M(MinvMom[i], Gtemp2); LMinvMom=fac*Gtemp2; | ||||||
|  |         AMinvMom = par.a1[i]*LMinvMom; | ||||||
|  |         AMinvMom += par.a0[i]*MinvMom[i]; | ||||||
|  |      | ||||||
|  |         LapStencil.M(AMinvMom, Gtemp2); LMinvAMom=fac*Gtemp2; | ||||||
|  |         LapStencil.M(MinvGMom, Gtemp2); temp=fac*Gtemp2; | ||||||
|  |         MinvAGMom = par.a1[i]*temp; | ||||||
|  |         MinvAGMom += par.a0[i]*MinvGMom; | ||||||
|  |         LapStencil.M(MinvAGMom, Gtemp2); LMinvAGMom=fac*Gtemp2; | ||||||
|  |      | ||||||
|  |      | ||||||
|  |         GaugeField tempDer(left.Grid()); | ||||||
|  |         std::vector<GaugeLinkField> DerLink(Nd,left.Grid()); | ||||||
|  |         std::vector<GaugeLinkField> tempDerLink(Nd,left.Grid()); | ||||||
|  |  | ||||||
|  |         std::cout<<GridLogMessage << "force contraction "<< i <<std::endl; | ||||||
|  |     //    roctxRangePushA("RMHMC force contraction"); | ||||||
|  |  #if 0 | ||||||
|  |         MDerivLink(GMom,MinvMom[i],tempDer); der += coef*2*par.a1[i]*tempDer; | ||||||
|  |         MDerivLink(left_nu,MinvGMom,tempDer); der += coef*2*par.a1[i]*tempDer; | ||||||
|  |         MDerivLink(LMinvAGMom,MinvMom[i],tempDer); der += coef*-2.*par.b2*tempDer; | ||||||
|  |         MDerivLink(LMinvAMom,MinvGMom,tempDer); der += coef*-2.*par.b2*tempDer; | ||||||
|  |         MDerivLink(MinvAGMom,LMinvMom,tempDer); der += coef*-2.*par.b2*tempDer; | ||||||
|  |         MDerivLink(AMinvMom,LMinvGMom,tempDer); der += coef*-2.*par.b2*tempDer; | ||||||
|  |         MDerivLink(MinvAGMom,MinvMom[i],tempDer); der += coef*-2.*par.b1[i]*tempDer; | ||||||
|  |         MDerivLink(AMinvMom,MinvGMom,tempDer); der += coef*-2.*par.b1[i]*tempDer; | ||||||
|  | #else | ||||||
|  | 	for (int mu=0;mu<Nd;mu++) DerLink[mu]=Zero(); | ||||||
|  |         MDerivLink(GMom,MinvMom[i],tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*2*par.a1[i]*tempDerLink[mu]; | ||||||
|  |         MDerivLink(left_nu,MinvGMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*2*par.a1[i]*tempDerLink[mu]; | ||||||
|  |         MDerivLink(LMinvAGMom,MinvMom[i],tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b2*tempDerLink[mu]; | ||||||
|  |         MDerivLink(LMinvAMom,MinvGMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b2*tempDerLink[mu]; | ||||||
|  |         MDerivLink(MinvAGMom,LMinvMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b2*tempDerLink[mu]; | ||||||
|  |         MDerivLink(AMinvMom,LMinvGMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b2*tempDerLink[mu]; | ||||||
|  |         MDerivLink(MinvAGMom,MinvMom[i],tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b1[i]*tempDerLink[mu]; | ||||||
|  |         MDerivLink(AMinvMom,MinvGMom,tempDerLink); 	for (int mu=0;mu<Nd;mu++) DerLink[mu] += coef*-2.*par.b1[i]*tempDerLink[mu]; | ||||||
|  | //      PokeIndex<LorentzIndex>(der, -factor * der_mu, mu); | ||||||
|  |         for (int mu=0;mu<Nd;mu++) PokeIndex<LorentzIndex>(tempDer, tempDerLink[mu], mu); | ||||||
|  |  | ||||||
|  | 	der += tempDer; | ||||||
|  | #endif | ||||||
|  |         std::cout<<GridLogMessage << "coef =  force contraction "<< i << "done "<< coef <<std::endl; | ||||||
|  |     //    roctxRangePop(); | ||||||
|  |      | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     std::cout<<GridLogMessage << "LaplaceEnd " <<std::endl; | ||||||
|  | //  exit(-42); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Single-field form: the same field is used as both the left and the right operand | ||||||
|  |   // of the three-argument MDeriv overload. | ||||||
|  |   void MDeriv(const GaugeField& in, GaugeField& der) | ||||||
|  |   { | ||||||
|  |     const GaugeField& operand = in; | ||||||
|  |     MDeriv(operand, operand, der); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Two-field derivative using the Mparam coefficients. | ||||||
|  |   // der is reset to zero first; MDerivInt then accumulates into it, using the | ||||||
|  |   // prev_solnsMDeriv member as its solution history. norm2(der) goes to the debug log. | ||||||
|  |   void MDeriv(const GaugeField& left, const GaugeField& right, GaugeField& der) | ||||||
|  |   { | ||||||
|  |     der = Zero(); | ||||||
|  |     MDerivInt(Mparam, left, right, der, prev_solnsMDeriv); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogDebug | ||||||
|  |               << "MDeriv:norm2(der) = " << norm2(der) | ||||||
|  |               << std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Derivative using the Gparam coefficients, with `in` as both left and right operand. | ||||||
|  |   // der is reset to zero and then accumulated into by MDerivInt, which uses the | ||||||
|  |   // prev_solnsMinvDeriv member as its solution history. | ||||||
|  |   // (A local history vector used to be constructed here but was never passed on; it is removed.) | ||||||
|  |   void MinvDeriv(const GaugeField& in, GaugeField& der) { | ||||||
|  |     der=Zero(); | ||||||
|  |     MDerivInt(Gparam, in, in, der,prev_solnsMinvDeriv); | ||||||
|  |     std::cout <<GridLogDebug << "MinvDeriv:norm2(der) = "<<norm2(der)<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   // Applies, in place and one Lorentz component at a time, the partial-fraction expression | ||||||
|  |   //   P_nu  ->  par.offset*P_nu + sum_i ( par.a0[i] + par.a1[i]*fac*L ) X_i | ||||||
|  |   // where L is LapStencil.M, fac = -1/(4*Nd), and X_i is what the (mixed-precision) CG solver | ||||||
|  |   // returns for the operator built from (par.b0[i], fac*par.b1[i], fac*fac*par.b2) with source P_nu. | ||||||
|  |   // | ||||||
|  |   // par        : rational-approximation coefficients (offset, order, a0, a1, b0, b1, b2, tolerance). | ||||||
|  |   // P          : field updated in place. | ||||||
|  |   // prev_solns : indexed by nu in [0,Nd); handed to ChronoForecast before every solve and | ||||||
|  |   //              appended to only when USE_CHRONO is enabled. | ||||||
|  |   void MSquareRootInt(LaplacianRatParams &par, GaugeField& P, std::vector< std::vector<GaugeLinkField> > & prev_solns ){ | ||||||
|  |     // Double-precision shifted quadratic operator solved for each term of the sum. | ||||||
|  |     typedef QuadLinearOperator<CovariantAdjointLaplacianStencil<Impl,typename Impl::LinkField>,GaugeLinkField> QuadOpD; | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage << "LaplaceStart " <<std::endl; | ||||||
|  |     RealD lapScale = -1. / (double(4 * Nd)); | ||||||
|  |     LapStencil.GaugeImport(Usav); | ||||||
|  |     LapStencilF.GaugeImport(UsavF); | ||||||
|  |  | ||||||
|  |     for(int nu=0; nu<Nd; nu++){ | ||||||
|  |       GaugeLinkField src = PeekIndex<LorentzIndex>(P, nu); | ||||||
|  |       GaugeLinkField accum(P.Grid()); | ||||||
|  |       accum = par.offset * src; | ||||||
|  |       ConjugateGradient<GaugeLinkField> CG(par.tolerance,10000); | ||||||
|  |  | ||||||
|  |       ChronoForecast< QuadOpD , GaugeLinkField> Forecast; | ||||||
|  |  | ||||||
|  |       GaugeLinkField soln(P.Grid()); | ||||||
|  |       GaugeLinkField lapSoln(P.Grid()); | ||||||
|  |  | ||||||
|  |       for(int term=0; term<par.order; term++){ | ||||||
|  |         QuadOpD QuadOp(LapStencil,par.b0[term],lapScale*par.b1[term],lapScale*lapScale*par.b2); | ||||||
|  |  | ||||||
|  |         // The forecast result is stored in the field that is then handed to the solver. | ||||||
|  |         soln = Forecast(QuadOp, src, prev_solns[nu]); | ||||||
|  | #ifdef MIXED_CG | ||||||
|  |         QuadLinearOperator<CovariantAdjointLaplacianStencil<ImplF,typename ImplF::LinkField>,GaugeLinkFieldF> QuadOpF(LapStencilF,par.b0[term],lapScale*par.b1[term],lapScale*lapScale*par.b2); | ||||||
|  |         MixedPrecisionConjugateGradient<GaugeLinkField,GaugeLinkFieldF> MixedCG(par.tolerance,10000,10000,grid_f,QuadOpF,QuadOp); | ||||||
|  |         MixedCG.InnerTolerance=par.tolerance; | ||||||
|  |         MixedCG(src,soln); | ||||||
|  | #else | ||||||
|  |         CG(QuadOp,src,soln); | ||||||
|  | #endif | ||||||
|  | #if USE_CHRONO | ||||||
|  |         prev_solns[nu].push_back(soln); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |         // Accumulate (a0 + a1*fac*L) applied to this term's solution. | ||||||
|  |         accum += par.a0[term]*soln; | ||||||
|  |         LapStencil.M(soln,lapSoln); | ||||||
|  |         accum += par.a1[term]*lapScale*lapSoln; | ||||||
|  |       } | ||||||
|  |       PokeIndex<LorentzIndex>(P, accum, nu); | ||||||
|  |     } | ||||||
|  |     std::cout<<GridLogMessage << "LaplaceEnd " <<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Applies MSquareRootInt with the Mparam coefficients to P in place and logs norm2(P) at debug level. | ||||||
|  |   // The solution history is local to this call, with one (initially empty) slot per Lorentz direction. | ||||||
|  |   void MSquareRoot(GaugeField& P){ | ||||||
|  |     // Sized with Nd rather than a literal 4: MSquareRootInt indexes it with nu < Nd. | ||||||
|  |     std::vector< std::vector<GaugeLinkField> > prev_solns(Nd); | ||||||
|  |     MSquareRootInt(Mparam,P,prev_solns); | ||||||
|  |     std::cout <<GridLogDebug << "MSquareRoot:norm2(P) = "<<norm2(P)<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Applies MSquareRootInt with the Gparam coefficients to P in place and logs norm2(P) at debug level. | ||||||
|  |   // The solution history is local to this call, with one (initially empty) slot per Lorentz direction. | ||||||
|  |   void MInvSquareRoot(GaugeField& P){ | ||||||
|  |     // Sized with Nd rather than a literal 4: MSquareRootInt indexes it with nu < Nd. | ||||||
|  |     std::vector< std::vector<GaugeLinkField> > prev_solns(Nd); | ||||||
|  |     MSquareRootInt(Gparam,P,prev_solns); | ||||||
|  |     std::cout <<GridLogDebug << "MInvSquareRoot:norm2(P) = "<<norm2(P)<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // out = result of applying MSquareRootInt (Mparam coefficients) twice to a copy of in. | ||||||
|  |   // Both applications share one local solution history; norm2(out) is logged at debug level. | ||||||
|  |   void M(const GaugeField& in, GaugeField& out) { | ||||||
|  |       out = in; | ||||||
|  |       // Sized with Nd rather than a literal 4: MSquareRootInt indexes it with nu < Nd. | ||||||
|  |       std::vector< std::vector<GaugeLinkField> > prev_solns(Nd); | ||||||
|  |       MSquareRootInt(Mparam,out,prev_solns); | ||||||
|  |       MSquareRootInt(Mparam,out,prev_solns); | ||||||
|  |       std::cout <<GridLogDebug << "M:norm2(out) = "<<norm2(out)<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // inverted = result of applying MSquareRootInt (Gparam coefficients) twice to a copy of in. | ||||||
|  |   // Both applications share one local solution history; norm2(inverted) is logged at debug level. | ||||||
|  |   void Minv(const GaugeField& in, GaugeField& inverted){ | ||||||
|  |       inverted = in; | ||||||
|  |       // Sized with Nd rather than a literal 4: MSquareRootInt indexes it with nu < Nd. | ||||||
|  |       std::vector< std::vector<GaugeLinkField> > prev_solns(Nd); | ||||||
|  |       MSquareRootInt(Gparam,inverted,prev_solns); | ||||||
|  |       MSquareRootInt(Gparam,inverted,prev_solns); | ||||||
|  |       std::cout <<GridLogDebug << "Minv:norm2(inverted) = "<<norm2(inverted)<<std::endl; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | private: | ||||||
|  |   std::vector<GaugeLinkField> U; | ||||||
|  | }; | ||||||
|  | #undef MIXED_CG | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
| @@ -100,9 +100,6 @@ class GaugeGroup { | |||||||
|   using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >; |   using iGroupMatrix = iScalar<iScalar<iMatrix<vtype, ncolour> > >; | ||||||
|   template <typename vtype> |   template <typename vtype> | ||||||
|   using iAlgebraVector = iScalar<iScalar<iVector<vtype, AdjointDimension> > >; |   using iAlgebraVector = iScalar<iScalar<iVector<vtype, AdjointDimension> > >; | ||||||
|   template <typename vtype> |  | ||||||
|   using iSUnAlgebraMatrix = |  | ||||||
|     iScalar<iScalar<iMatrix<vtype, AdjointDimension> > >; |  | ||||||
|   static int su2subgroups(void) { return su2subgroups(group_name()); } |   static int su2subgroups(void) { return su2subgroups(group_name()); } | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////////////////////////////////////////////////////////// |   ////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -131,19 +128,10 @@ class GaugeGroup { | |||||||
|   typedef Lattice<vMatrix> LatticeMatrix; |   typedef Lattice<vMatrix> LatticeMatrix; | ||||||
|   typedef Lattice<vMatrixF> LatticeMatrixF; |   typedef Lattice<vMatrixF> LatticeMatrixF; | ||||||
|   typedef Lattice<vMatrixD> LatticeMatrixD; |   typedef Lattice<vMatrixD> LatticeMatrixD; | ||||||
|    |  | ||||||
|   typedef Lattice<vAlgebraVector> LatticeAlgebraVector; |   typedef Lattice<vAlgebraVector> LatticeAlgebraVector; | ||||||
|   typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF; |   typedef Lattice<vAlgebraVectorF> LatticeAlgebraVectorF; | ||||||
|   typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD; |   typedef Lattice<vAlgebraVectorD> LatticeAlgebraVectorD; | ||||||
|     |  | ||||||
|   typedef iSUnAlgebraMatrix<vComplex>  vAlgebraMatrix; |  | ||||||
|   typedef iSUnAlgebraMatrix<vComplexF> vAlgebraMatrixF; |  | ||||||
|   typedef iSUnAlgebraMatrix<vComplexD> vAlgebraMatrixD; |  | ||||||
|  |  | ||||||
|   typedef Lattice<vAlgebraMatrix>  LatticeAlgebraMatrix; |  | ||||||
|   typedef Lattice<vAlgebraMatrixF> LatticeAlgebraMatrixF; |  | ||||||
|   typedef Lattice<vAlgebraMatrixD> LatticeAlgebraMatrixD; |  | ||||||
|    |  | ||||||
|  |  | ||||||
|   typedef iSU2Matrix<Complex> SU2Matrix; |   typedef iSU2Matrix<Complex> SU2Matrix; | ||||||
|   typedef iSU2Matrix<ComplexF> SU2MatrixF; |   typedef iSU2Matrix<ComplexF> SU2MatrixF; | ||||||
| @@ -172,7 +160,7 @@ class GaugeGroup { | |||||||
|     return generator(lieIndex, ta, group_name()); |     return generator(lieIndex, ta, group_name()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index) { |   static void su2SubGroupIndex(int &i1, int &i2, int su2_index) { | ||||||
|     return su2SubGroupIndex(i1, i2, su2_index, group_name()); |     return su2SubGroupIndex(i1, i2, su2_index, group_name()); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -401,52 +389,6 @@ class GaugeGroup { | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| // Ta are hermitian (?) |  | ||||||
| // Anti herm is i Ta basis |  | ||||||
| static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in, int b) |  | ||||||
| { |  | ||||||
|   conformable(in, out); |  | ||||||
|   GridBase *grid = out.Grid(); |  | ||||||
|   LatticeComplex tmp(grid); |  | ||||||
|   Matrix ta; |  | ||||||
|   // Using Luchang's projection convention |  | ||||||
|   //  2 Tr{Ta Tb} A_b= 2/2 delta ab A_b = A_a |  | ||||||
|   autoView(out_v,out,AcceleratorWrite); |  | ||||||
|   autoView(in_v,in,AcceleratorRead); |  | ||||||
|   int N = ncolour; |  | ||||||
|   int NNm1 = N * (N - 1); |  | ||||||
|   int hNNm1= NNm1/2; |  | ||||||
|   RealD sqrt_2 = sqrt(2.0); |  | ||||||
|   Complex ci(0.0,1.0); |  | ||||||
|   for(int su2Index=0;su2Index<hNNm1;su2Index++){ |  | ||||||
|     int i1, i2; |  | ||||||
|     su2SubGroupIndex(i1, i2, su2Index); |  | ||||||
|     int ax = su2Index*2; |  | ||||||
|     int ay = su2Index*2+1; |  | ||||||
|     accelerator_for(ss,grid->oSites(),1,{ |  | ||||||
| 	// in is traceless ANTI-hermitian whereas Grid generators are Hermitian. |  | ||||||
| 	// trace( Ta x Ci in) |  | ||||||
| 	// Bet I need to move to real part with mult by -i |  | ||||||
| 	out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2))); |  | ||||||
| 	out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1))); |  | ||||||
|       }); |  | ||||||
|   } |  | ||||||
|   for(int diagIndex=0;diagIndex<N-1;diagIndex++){ |  | ||||||
|     int k = diagIndex + 1; // diagIndex starts from 0 |  | ||||||
|     int a = NNm1+diagIndex; |  | ||||||
|     RealD scale = 1.0/sqrt(2.0*k*(k+1)); |  | ||||||
|     accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{ |  | ||||||
| 	auto tmp = in_v[ss]()()(0,0); |  | ||||||
| 	for(int i=1;i<k;i++){ |  | ||||||
| 	  tmp=tmp+in_v[ss]()()(i,i); |  | ||||||
| 	} |  | ||||||
| 	tmp = tmp - in_v[ss]()()(k,k)*k; |  | ||||||
| 	out_v[ss]()()(a,b) =imag(tmp) * scale; |  | ||||||
|       }); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
|    |  | ||||||
| }; | }; | ||||||
|      |      | ||||||
| template <int ncolour> | template <int ncolour> | ||||||
|   | |||||||
| @@ -7,6 +7,7 @@ Source file: ./lib/qcd/hmc/integrators/Integrator.h | |||||||
| Copyright (C) 2015 | Copyright (C) 2015 | ||||||
|  |  | ||||||
| Author: Guido Cossu <guido.cossu@ed.ac.uk> | Author: Guido Cossu <guido.cossu@ed.ac.uk> | ||||||
|  | Author: Chulwoo Jung <chulwoo@bnl.gov> | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify | This program is free software; you can redistribute it and/or modify | ||||||
| it under the terms of the GNU General Public License as published by | it under the terms of the GNU General Public License as published by | ||||||
| @@ -33,7 +34,12 @@ NAMESPACE_BEGIN(Grid); | |||||||
|  |  | ||||||
| template <typename Field>  | template <typename Field>  | ||||||
| class Metric{ | class Metric{ | ||||||
|  | protected: | ||||||
|  |   int triv; | ||||||
| public: | public: | ||||||
|  |   Metric(){this->triv=1;} | ||||||
|  |   int Trivial(){ return triv;} | ||||||
|  | //printf("Metric::Trivial=%d\n",triv); ; | ||||||
|   virtual void ImportGauge(const Field&)   = 0; |   virtual void ImportGauge(const Field&)   = 0; | ||||||
|   virtual void M(const Field&, Field&)     = 0; |   virtual void M(const Field&, Field&)     = 0; | ||||||
|   virtual void Minv(const Field&, Field&)  = 0; |   virtual void Minv(const Field&, Field&)  = 0; | ||||||
| @@ -41,6 +47,8 @@ public: | |||||||
|   virtual void MInvSquareRoot(Field&) = 0; |   virtual void MInvSquareRoot(Field&) = 0; | ||||||
|   virtual void MDeriv(const Field&, Field&) = 0; |   virtual void MDeriv(const Field&, Field&) = 0; | ||||||
|   virtual void MDeriv(const Field&, const Field&, Field&) = 0; |   virtual void MDeriv(const Field&, const Field&, Field&) = 0; | ||||||
|  |   virtual void MinvDeriv(const Field&, Field&) = 0; | ||||||
|  | //  virtual void MinvDeriv(const Field&, const Field&, Field&) = 0; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -48,23 +56,36 @@ public: | |||||||
| template <typename Field> | template <typename Field> | ||||||
| class TrivialMetric : public Metric<Field>{ | class TrivialMetric : public Metric<Field>{ | ||||||
| public: | public: | ||||||
|  | //  TrivialMetric(){this->triv=1;printf("TrivialMetric::triv=%d\n",this->Trivial());} | ||||||
|   virtual void ImportGauge(const Field&){}; |   virtual void ImportGauge(const Field&){}; | ||||||
|   virtual void M(const Field& in, Field& out){ |   virtual void M(const Field& in, Field& out){ | ||||||
|  | //    printf("M:norm=%0.15e\n",norm2(in)); | ||||||
|  |     std::cout << GridLogIntegrator << " M:norm(in)= " << std::sqrt(norm2(in)) << std::endl; | ||||||
|     out = in; |     out = in; | ||||||
|   } |   } | ||||||
|   virtual void Minv(const Field& in, Field& out){ |   virtual void Minv(const Field& in, Field& out){ | ||||||
|  |     std::cout << GridLogIntegrator << " Minv:norm(in)= " << std::sqrt(norm2(in)) << std::endl; | ||||||
|     out = in; |     out = in; | ||||||
|   } |   } | ||||||
|   virtual void MSquareRoot(Field& P){ |   virtual void MSquareRoot(Field& P){ | ||||||
|  |     std::cout << GridLogIntegrator << " MSquareRoot:norm(P)= " << std::sqrt(norm2(P)) << std::endl; | ||||||
|     // do nothing |     // do nothing | ||||||
|   } |   } | ||||||
|   virtual void MInvSquareRoot(Field& P){ |   virtual void MInvSquareRoot(Field& P){ | ||||||
|  |     std::cout << GridLogIntegrator << " MInvSquareRoot:norm(P)= " << std::sqrt(norm2(P)) << std::endl; | ||||||
|     // do nothing |     // do nothing | ||||||
|   } |   } | ||||||
|   virtual void MDeriv(const Field& in, Field& out){ |   virtual void MDeriv(const Field& in, Field& out){ | ||||||
|  |     std::cout << GridLogIntegrator << " MDeriv:norm(in)= " << std::sqrt(norm2(in)) << std::endl; | ||||||
|  |     out = Zero(); | ||||||
|  |   } | ||||||
|  |   virtual void MinvDeriv(const Field& in, Field& out){ | ||||||
|  |     std::cout << GridLogIntegrator << " MinvDeriv:norm(in)= " << std::sqrt(norm2(in)) << std::endl; | ||||||
|     out = Zero(); |     out = Zero(); | ||||||
|   } |   } | ||||||
|   virtual void MDeriv(const Field& left, const Field& right, Field& out){ |   virtual void MDeriv(const Field& left, const Field& right, Field& out){ | ||||||
|  |     std::cout << GridLogIntegrator << " MDeriv:norm(left)= " << std::sqrt(norm2(left)) << std::endl; | ||||||
|  |     std::cout << GridLogIntegrator << " MDeriv:norm(right)= " << std::sqrt(norm2(right)) << std::endl; | ||||||
|     out = Zero(); |     out = Zero(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -101,14 +122,15 @@ public: | |||||||
|     // Generate gaussian momenta |     // Generate gaussian momenta | ||||||
|     Implementation::generate_momenta(Mom, sRNG, pRNG); |     Implementation::generate_momenta(Mom, sRNG, pRNG); | ||||||
|     // Modify the distribution with the metric |     // Modify the distribution with the metric | ||||||
|  | //    if(M.Trivial()) return; | ||||||
|     M.MSquareRoot(Mom); |     M.MSquareRoot(Mom); | ||||||
|  |  | ||||||
|     if (1) { |     if (1) { | ||||||
|       // Auxiliary momenta |       // Auxiliary momenta | ||||||
|       // do nothing if trivial, so hide in the metric |       // do nothing if trivial, so hide in the metric | ||||||
|       MomentaField AuxMomTemp(Mom.Grid()); |       MomentaField AuxMomTemp(Mom.Grid()); | ||||||
|       Implementation::generate_momenta(AuxMom, sRNG, pRNG); |       Implementation::generate_momenta(AuxMom, sRNG,pRNG); | ||||||
|       Implementation::generate_momenta(AuxField, sRNG, pRNG); |       Implementation::generate_momenta(AuxField, sRNG,pRNG); | ||||||
|       // Modify the distribution with the metric |       // Modify the distribution with the metric | ||||||
|       // Aux^dag M Aux |       // Aux^dag M Aux | ||||||
|       M.MInvSquareRoot(AuxMom);  // AuxMom = M^{-1/2} AuxMomTemp |       M.MInvSquareRoot(AuxMom);  // AuxMom = M^{-1/2} AuxMomTemp | ||||||
| @@ -117,11 +139,12 @@ public: | |||||||
|  |  | ||||||
|   // Correct |   // Correct | ||||||
|   RealD MomentaAction(){ |   RealD MomentaAction(){ | ||||||
|  |     static RealD Saux=0.,Smom=0.; | ||||||
|     MomentaField inv(Mom.Grid()); |     MomentaField inv(Mom.Grid()); | ||||||
|     inv = Zero(); |     inv = Zero(); | ||||||
|     M.Minv(Mom, inv); |     M.Minv(Mom, inv); | ||||||
|     LatticeComplex Hloc(Mom.Grid()); |     LatticeComplex Hloc(Mom.Grid()); Hloc = Zero(); | ||||||
|     Hloc = Zero(); |     LatticeComplex Hloc2(Mom.Grid()); Hloc2 = Zero(); | ||||||
|     for (int mu = 0; mu < Nd; mu++) { |     for (int mu = 0; mu < Nd; mu++) { | ||||||
|       // This is not very general |       // This is not very general | ||||||
|       // hide in the metric |       // hide in the metric | ||||||
| @@ -129,8 +152,15 @@ public: | |||||||
|       auto inv_mu = PeekIndex<LorentzIndex>(inv, mu); |       auto inv_mu = PeekIndex<LorentzIndex>(inv, mu); | ||||||
|       Hloc += trace(Mom_mu * inv_mu); |       Hloc += trace(Mom_mu * inv_mu); | ||||||
|     } |     } | ||||||
|  |     auto Htmp1 = TensorRemove(sum(Hloc)); | ||||||
|  |     std::cout << GridLogMessage << "S:dSmom = " << Htmp1.real()-Smom << "\n"; | ||||||
|  |     Smom=Htmp1.real()/HMC_MOMENTUM_DENOMINATOR; | ||||||
|  |      | ||||||
|  |  | ||||||
|     if (1) { |      | ||||||
|  |  | ||||||
|  | //    if(!M.Trivial())  | ||||||
|  |     { | ||||||
|       // Auxiliary Fields |       // Auxiliary Fields | ||||||
|       // hide in the metric |       // hide in the metric | ||||||
|       M.M(AuxMom, inv); |       M.M(AuxMom, inv); | ||||||
| @@ -140,13 +170,18 @@ public: | |||||||
|         auto inv_mu = PeekIndex<LorentzIndex>(inv, mu); |         auto inv_mu = PeekIndex<LorentzIndex>(inv, mu); | ||||||
|         auto am_mu = PeekIndex<LorentzIndex>(AuxMom, mu); |         auto am_mu = PeekIndex<LorentzIndex>(AuxMom, mu); | ||||||
|         auto af_mu = PeekIndex<LorentzIndex>(AuxField, mu); |         auto af_mu = PeekIndex<LorentzIndex>(AuxField, mu); | ||||||
|         Hloc += trace(am_mu * inv_mu);// p M p |         Hloc += trace(am_mu * inv_mu); | ||||||
|         Hloc += trace(af_mu * af_mu); |         Hloc2 += trace(af_mu * af_mu); | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|  |     auto Htmp2 = TensorRemove(sum(Hloc))-Htmp1; | ||||||
|  |     std::cout << GridLogMessage << "S:dSaux = " << Htmp2.real()-Saux << "\n"; | ||||||
|  |     Saux=Htmp2.real(); | ||||||
|  |  | ||||||
|     auto Hsum = TensorRemove(sum(Hloc)); |     auto Hsum = TensorRemove(sum(Hloc))/HMC_MOMENTUM_DENOMINATOR; | ||||||
|     return Hsum.real(); |     auto Hsum2 = TensorRemove(sum(Hloc2)); | ||||||
|  |     std::cout << GridLogIntegrator << "MomentaAction: " <<  Hsum.real()+Hsum2.real() << std::endl; | ||||||
|  |     return Hsum.real()+Hsum2.real(); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   // Correct |   // Correct | ||||||
| @@ -157,15 +192,17 @@ public: | |||||||
|     MomentaField MDer(in.Grid()); |     MomentaField MDer(in.Grid()); | ||||||
|     MomentaField X(in.Grid()); |     MomentaField X(in.Grid()); | ||||||
|     X = Zero(); |     X = Zero(); | ||||||
|     M.Minv(in, X);  // X = G in |     M.MinvDeriv(in, MDer);  // MDer = U * dS/dU | ||||||
|     M.MDeriv(X, MDer);  // MDer = U * dS/dU |     der = -1.0* Implementation::projectForce(MDer);  // Ta if gauge fields | ||||||
|     der = Implementation::projectForce(MDer);  // Ta if gauge fields | //    std::cout << GridLogIntegrator << " DerivativeU: norm(in)= " << std::sqrt(norm2(in)) << std::endl; | ||||||
|  | //    std::cout << GridLogIntegrator << " DerivativeU: norm(der)= " << std::sqrt(norm2(der)) << std::endl; | ||||||
|      |      | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void AuxiliaryFieldsDerivative(MomentaField& der){ |   void AuxiliaryFieldsDerivative(MomentaField& der){ | ||||||
|     der = Zero(); |     der = Zero(); | ||||||
|     if (1){ | //    if(!M.Trivial())  | ||||||
|  |     { | ||||||
|       // Auxiliary fields |       // Auxiliary fields | ||||||
|       MomentaField der_temp(der.Grid()); |       MomentaField der_temp(der.Grid()); | ||||||
|       MomentaField X(der.Grid()); |       MomentaField X(der.Grid()); | ||||||
| @@ -173,6 +210,7 @@ public: | |||||||
|       //M.M(AuxMom, X); // X = M Aux |       //M.M(AuxMom, X); // X = M Aux | ||||||
|       // Two derivative terms |       // Two derivative terms | ||||||
|       // the Mderiv need separation of left and right terms |       // the Mderiv need separation of left and right terms | ||||||
|  |     std::cout << GridLogIntegrator << " AuxiliaryFieldsDerivative:norm(AuxMom)= " << std::sqrt(norm2(AuxMom)) << std::endl; | ||||||
|       M.MDeriv(AuxMom, der);  |       M.MDeriv(AuxMom, der);  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -180,6 +218,7 @@ public: | |||||||
|       //M.MDeriv(X, AuxMom, der_temp); der += der_temp; |       //M.MDeriv(X, AuxMom, der_temp); der += der_temp; | ||||||
|  |  | ||||||
|       der = -1.0*Implementation::projectForce(der); |       der = -1.0*Implementation::projectForce(der); | ||||||
|  |       std::cout << GridLogIntegrator << " AuxiliaryFieldsDerivative:norm(der)= " << std::sqrt(norm2(der)) << std::endl; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| @@ -189,22 +228,28 @@ public: | |||||||
|     // is the projection necessary here? |     // is the projection necessary here? | ||||||
|     // no for fields in the algebra |     // no for fields in the algebra | ||||||
|     der = Implementation::projectForce(der);  |     der = Implementation::projectForce(der);  | ||||||
|  |     std::cout << GridLogIntegrator << " DerivativeP:norm(der)= " << std::sqrt(norm2(der)) << std::endl; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void update_auxiliary_momenta(RealD ep){ |   void update_auxiliary_momenta(RealD ep){ | ||||||
|     if(1){ |       std::cout << GridLogIntegrator << "AuxMom update_auxiliary_fields: " << std::sqrt(norm2(AuxMom)) << std::endl; | ||||||
|       AuxMom -= ep * AuxField; |       std::cout << GridLogIntegrator << "AuxField update_auxiliary_fields: " << std::sqrt(norm2(AuxField)) << std::endl; | ||||||
|  |     { | ||||||
|  |       AuxMom -= ep * AuxField * HMC_MOMENTUM_DENOMINATOR; | ||||||
|  |       std::cout << GridLogIntegrator << "AuxMom update_auxiliary_fields: " << std::sqrt(norm2(AuxMom)) << std::endl; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void update_auxiliary_fields(RealD ep){ |   void update_auxiliary_fields(RealD ep){ | ||||||
|     if (1) { | //    if(!M.Trivial())  | ||||||
|  |     { | ||||||
|       MomentaField tmp(AuxMom.Grid()); |       MomentaField tmp(AuxMom.Grid()); | ||||||
|       MomentaField tmp2(AuxMom.Grid()); |       MomentaField tmp2(AuxMom.Grid()); | ||||||
|       M.M(AuxMom, tmp); |       M.M(AuxMom, tmp); | ||||||
|       // M.M(tmp, tmp2); |       // M.M(tmp, tmp2); | ||||||
|       AuxField += ep * tmp;  // M^2 AuxMom |       AuxField += ep * tmp;  // M^2 AuxMom | ||||||
|       // factor of 2? |       // factor of 2? | ||||||
|  |       std::cout << GridLogIntegrator << "AuxField update_auxiliary_fields: " << std::sqrt(norm2(AuxField)) << std::endl; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -10,7 +10,6 @@ | |||||||
| // doesn't get found by the scripts/filelist during bootstrapping. | // doesn't get found by the scripts/filelist during bootstrapping. | ||||||
|  |  | ||||||
| private: | private: | ||||||
|  |  | ||||||
| template <ONLY_IF_SU> | template <ONLY_IF_SU> | ||||||
| static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; } | static int su2subgroups(GroupName::SU) { return (ncolour * (ncolour - 1)) / 2; } | ||||||
| //////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////// | ||||||
| @@ -577,4 +576,3 @@ static void RandomGaugeTransform(GridParallelRNG &pRNG, typename Gimpl::GaugeFie | |||||||
|   LieRandomize(pRNG,g,1.0); |   LieRandomize(pRNG,g,1.0); | ||||||
|   GaugeTransform<Gimpl>(Umu,g); |   GaugeTransform<Gimpl>(Umu,g); | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1133,13 +1133,4 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc | |||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
| template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {}; |  | ||||||
| template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {}; |  | ||||||
| template<> struct sycl::is_device_copyable<Grid::vRealF   > : public std::true_type {}; |  | ||||||
| template<> struct sycl::is_device_copyable<Grid::vRealD   > : public std::true_type {}; |  | ||||||
| template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {}; |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -218,10 +218,6 @@ public: | |||||||
|     // ------------------------------------------------- |     // ------------------------------------------------- | ||||||
|     // misc |     // misc | ||||||
|     // ------------------------------------------------- |     // ------------------------------------------------- | ||||||
|     void discardhi(uint64_t z) { |  | ||||||
|       _s[3] += z; |  | ||||||
|       encrypt_counter(); |  | ||||||
|     } |  | ||||||
|      |      | ||||||
|     // req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9 |     // req: 26.5.1.4 Random number engine requirements, p.908 table 117, row 9 | ||||||
|     // Advances e’s state ei to ei+z by any means equivalent to z |     // Advances e’s state ei to ei+z by any means equivalent to z | ||||||
| @@ -391,4 +387,4 @@ private: | |||||||
| #undef MIXK | #undef MIXK | ||||||
| #undef MIX2 | #undef MIX2 | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
| @@ -137,55 +137,5 @@ public: | |||||||
|    |    | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| //////////////////////////////////////////////// |  | ||||||
| // Some machinery to streamline making a stencil  |  | ||||||
| //////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
| class shiftSignal { |  | ||||||
| public: |  | ||||||
|     enum { |  | ||||||
|         BACKWARD_CONST = 16, |  | ||||||
|         NO_SHIFT       = -1 |  | ||||||
|     }; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| // TODO: put a check somewhere that BACKWARD_CONST > Nd! |  | ||||||
|  |  | ||||||
| /*!  @brief signals that you want to go backwards in direction dir */ |  | ||||||
| inline int Back(const int dir) { |  | ||||||
|     // generalShift will use BACKWARD_CONST to determine whether we step forward or  |  | ||||||
|     // backward. Trick inspired by SIMULATeQCD.  |  | ||||||
|     return dir + shiftSignal::BACKWARD_CONST; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /*!  @brief shift one unit in direction dir */ |  | ||||||
| template<typename... Args> |  | ||||||
| void generalShift(Coordinate& shift, int dir) { |  | ||||||
|     if (dir >= shiftSignal::BACKWARD_CONST) { |  | ||||||
|         dir -= shiftSignal::BACKWARD_CONST; |  | ||||||
|         shift[dir]+=-1; |  | ||||||
|     } else if (dir == shiftSignal::NO_SHIFT) { |  | ||||||
|         ; // do nothing |  | ||||||
|     } else { |  | ||||||
|         shift[dir]+=1; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /*!  @brief follow a path of directions, shifting one unit in each direction */ |  | ||||||
| template<typename... Args> |  | ||||||
| void generalShift(Coordinate& shift, int dir, Args... args) { |  | ||||||
|     if (dir >= shiftSignal::BACKWARD_CONST) { |  | ||||||
|         dir -= shiftSignal::BACKWARD_CONST; |  | ||||||
|         shift[dir]+=-1; |  | ||||||
|     } else if (dir == shiftSignal::NO_SHIFT) { |  | ||||||
|         ; // do nothing |  | ||||||
|     } else { |  | ||||||
|         shift[dir]+=1; |  | ||||||
|     } |  | ||||||
|     generalShift(shift, args...); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -70,6 +70,57 @@ struct DefaultImplParams { | |||||||
| void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, | void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, | ||||||
| 				 int off,std::vector<std::pair<int,int> > & table); | 				 int off,std::vector<std::pair<int,int> > & table); | ||||||
|  |  | ||||||
|  | /* | ||||||
|  | template<class vobj,class cobj,class compressor> | ||||||
|  | void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)   __attribute__((noinline)); | ||||||
|  |  | ||||||
|  | template<class vobj,class cobj,class compressor> | ||||||
|  | void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) | ||||||
|  | { | ||||||
|  |   int num=table.size(); | ||||||
|  |   std::pair<int,int> *table_v = & table[0]; | ||||||
|  |  | ||||||
|  |   auto rhs_v = rhs.View(AcceleratorRead); | ||||||
|  |   accelerator_forNB( i,num, vobj::Nsimd(), { | ||||||
|  |     compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]); | ||||||
|  |   }); | ||||||
|  |   rhs_v.ViewClose(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////// | ||||||
|  | // Gather for when there *is* need to SIMD split with compression | ||||||
|  | /////////////////////////////////////////////////////////////////// | ||||||
|  | template<class cobj,class vobj,class compressor> | ||||||
|  | void Gather_plane_exchange_table(const Lattice<vobj> &rhs, | ||||||
|  | 				 commVector<cobj *> pointers, | ||||||
|  | 				 int dimension,int plane, | ||||||
|  | 				 int cbmask,compressor &compress,int type) __attribute__((noinline)); | ||||||
|  |  | ||||||
|  | template<class cobj,class vobj,class compressor> | ||||||
|  | void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table, | ||||||
|  | 				 const Lattice<vobj> &rhs, | ||||||
|  | 				 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask, | ||||||
|  | 				 compressor &compress,int type) | ||||||
|  | { | ||||||
|  |   assert( (table.size()&0x1)==0); | ||||||
|  |   int num=table.size()/2; | ||||||
|  |   int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane | ||||||
|  |  | ||||||
|  |   auto rhs_v = rhs.View(AcceleratorRead); | ||||||
|  |   auto rhs_p = &rhs_v[0]; | ||||||
|  |   auto p0=&pointers[0][0]; | ||||||
|  |   auto p1=&pointers[1][0]; | ||||||
|  |   auto tp=&table[0]; | ||||||
|  |   accelerator_forNB(j, num, vobj::Nsimd(), { | ||||||
|  |       compress.CompressExchange(p0,p1, rhs_p, j, | ||||||
|  | 				so+tp[2*j  ].second, | ||||||
|  | 				so+tp[2*j+1].second, | ||||||
|  | 				type); | ||||||
|  |   }); | ||||||
|  |   rhs_v.ViewClose(); | ||||||
|  | } | ||||||
|  | */ | ||||||
|  |  | ||||||
| void DslashResetCounts(void); | void DslashResetCounts(void); | ||||||
| void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); | void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); | ||||||
| void DslashLogFull(void); | void DslashLogFull(void); | ||||||
| @@ -207,10 +258,6 @@ public: | |||||||
|   struct Packet { |   struct Packet { | ||||||
|     void * send_buf; |     void * send_buf; | ||||||
|     void * recv_buf; |     void * recv_buf; | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
|     void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE |  | ||||||
|     void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE |  | ||||||
| #endif |  | ||||||
|     Integer to_rank; |     Integer to_rank; | ||||||
|     Integer from_rank; |     Integer from_rank; | ||||||
|     Integer do_send; |     Integer do_send; | ||||||
| @@ -277,7 +324,7 @@ public: | |||||||
|   Vector<int> surface_list; |   Vector<int> surface_list; | ||||||
|  |  | ||||||
|   stencilVector<StencilEntry>  _entries; // Resident in managed memory |   stencilVector<StencilEntry>  _entries; // Resident in managed memory | ||||||
|   commVector<StencilEntry>     _entries_device; // Resident in device memory |   commVector<StencilEntry>     _entries_device; // Resident in managed memory | ||||||
|   std::vector<Packet> Packets; |   std::vector<Packet> Packets; | ||||||
|   std::vector<Merge> Mergers; |   std::vector<Merge> Mergers; | ||||||
|   std::vector<Merge> MergersSHM; |   std::vector<Merge> MergersSHM; | ||||||
| @@ -361,16 +408,33 @@ public: | |||||||
|   // Use OpenMP Tasks for cleaner ??? |   // Use OpenMP Tasks for cleaner ??? | ||||||
|   // must be called *inside* parallel region |   // must be called *inside* parallel region | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|  |   /* | ||||||
|  |   void CommunicateThreaded() | ||||||
|  |   { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |     int mythread = omp_get_thread_num(); | ||||||
|  |     int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  | #else | ||||||
|  |     int mythread = 0; | ||||||
|  |     int nthreads = 1; | ||||||
|  | #endif | ||||||
|  |     if (nthreads == -1) nthreads = 1; | ||||||
|  |     if (mythread < nthreads) { | ||||||
|  |       for (int i = mythread; i < Packets.size(); i += nthreads) { | ||||||
|  | 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, | ||||||
|  | 						      Packets[i].to_rank, | ||||||
|  | 						      Packets[i].recv_buf, | ||||||
|  | 						      Packets[i].from_rank, | ||||||
|  | 						      Packets[i].bytes,i); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   */ | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Non blocking send and receive. Necessarily parallel. |   // Non blocking send and receive. Necessarily parallel. | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     // All GPU kernel tasks must complete |  | ||||||
|     //    accelerator_barrier();     // All kernels should ALREADY be complete |  | ||||||
|     //    _grid->StencilBarrier();   // Everyone is here, so noone running slow and still using receive buffer |  | ||||||
|                                // But the HaloGather had a barrier too. |  | ||||||
| #ifdef ACCELERATOR_AWARE_MPI |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       _grid->StencilSendToRecvFromBegin(MpiReqs, |       _grid->StencilSendToRecvFromBegin(MpiReqs, | ||||||
| 					Packets[i].send_buf, | 					Packets[i].send_buf, | ||||||
| @@ -379,54 +443,16 @@ public: | |||||||
| 					Packets[i].from_rank,Packets[i].do_recv, | 					Packets[i].from_rank,Packets[i].do_recv, | ||||||
| 					Packets[i].xbytes,Packets[i].rbytes,i); | 					Packets[i].xbytes,Packets[i].rbytes,i); | ||||||
|     } |     } | ||||||
| #else |  | ||||||
| #warning "Using COPY VIA HOST BUFFERS IN STENCIL" |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |  | ||||||
|       // Introduce a host buffer with a cheap slab allocator and zero cost wipe all |  | ||||||
|       Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes); |  | ||||||
|       Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes); |  | ||||||
|       if ( Packets[i].do_send ) { |  | ||||||
| 	acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes); |  | ||||||
|       } |  | ||||||
|       _grid->StencilSendToRecvFromBegin(MpiReqs, |  | ||||||
| 					Packets[i].host_send_buf, |  | ||||||
| 					Packets[i].to_rank,Packets[i].do_send, |  | ||||||
| 					Packets[i].host_recv_buf, |  | ||||||
| 					Packets[i].from_rank,Packets[i].do_recv, |  | ||||||
| 					Packets[i].xbytes,Packets[i].rbytes,i); |  | ||||||
|     } |  | ||||||
| #endif |  | ||||||
|     // Get comms started then run checksums |  | ||||||
|     // Having this PRIOR to the dslash seems to make Sunspot work... (!) |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |  | ||||||
|       if ( Packets[i].do_send ) |  | ||||||
| 	FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes); |  | ||||||
|     } |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done |     _grid->StencilSendToRecvFromComplete(MpiReqs,0); | ||||||
|     if   ( this->partialDirichlet ) DslashLogPartial(); |     if   ( this->partialDirichlet ) DslashLogPartial(); | ||||||
|     else if ( this->fullDirichlet ) DslashLogDirichlet(); |     else if ( this->fullDirichlet ) DslashLogDirichlet(); | ||||||
|     else DslashLogFull(); |     else DslashLogFull(); | ||||||
|     // acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete |     acceleratorCopySynchronise(); | ||||||
|     //    accelerator_barrier();  |  | ||||||
|     _grid->StencilBarrier();  |     _grid->StencilBarrier();  | ||||||
| #ifndef ACCELERATOR_AWARE_MPI |  | ||||||
| #warning "Using COPY VIA HOST BUFFERS IN STENCIL" |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |  | ||||||
|       if ( Packets[i].do_recv ) { |  | ||||||
| 	acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     _grid->HostBufferFreeAll(); |  | ||||||
| #endif |  | ||||||
|     // run any checksums |  | ||||||
|     for(int i=0;i<Packets.size();i++){ |  | ||||||
|       if ( Packets[i].do_recv ) |  | ||||||
| 	FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank); |  | ||||||
|     } |  | ||||||
|   } |   } | ||||||
|   //////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////// | ||||||
|   // Blocking send and receive. Either sequential or parallel. |   // Blocking send and receive. Either sequential or parallel. | ||||||
| @@ -502,7 +528,6 @@ public: | |||||||
|   template<class compressor> |   template<class compressor> | ||||||
|   void HaloGather(const Lattice<vobj> &source,compressor &compress) |   void HaloGather(const Lattice<vobj> &source,compressor &compress) | ||||||
|   { |   { | ||||||
|     //    accelerator_barrier(); |  | ||||||
|     _grid->StencilBarrier();// Synch shared memory on a single nodes |     _grid->StencilBarrier();// Synch shared memory on a single nodes | ||||||
|  |  | ||||||
|     assert(source.Grid()==_grid); |     assert(source.Grid()==_grid); | ||||||
| @@ -515,9 +540,10 @@ public: | |||||||
|       compress.Point(point); |       compress.Point(point); | ||||||
|       HaloGatherDir(source,compress,point,face_idx); |       HaloGatherDir(source,compress,point,face_idx); | ||||||
|     } |     } | ||||||
|     accelerator_barrier(); // All my local gathers are complete |     accelerator_barrier(); | ||||||
|     face_table_computed=1; |     face_table_computed=1; | ||||||
|     assert(u_comm_offset==_unified_buffer_size); |     assert(u_comm_offset==_unified_buffer_size); | ||||||
|  |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ///////////////////////// |   ///////////////////////// | ||||||
| @@ -553,7 +579,6 @@ public: | |||||||
|       accelerator_forNB(j, words, cobj::Nsimd(), { |       accelerator_forNB(j, words, cobj::Nsimd(), { | ||||||
| 	  coalescedWrite(to[j] ,coalescedRead(from [j])); | 	  coalescedWrite(to[j] ,coalescedRead(from [j])); | ||||||
|       }); |       }); | ||||||
|       acceleratorFenceComputeStream(); |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|    |    | ||||||
| @@ -644,7 +669,6 @@ public: | |||||||
|     for(int i=0;i<dd.size();i++){ |     for(int i=0;i<dd.size();i++){ | ||||||
|       decompressor::DecompressFace(decompress,dd[i]); |       decompressor::DecompressFace(decompress,dd[i]); | ||||||
|     } |     } | ||||||
|     acceleratorFenceComputeStream(); // dependent kernels |  | ||||||
|   } |   } | ||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   // Set up routines |   // Set up routines | ||||||
| @@ -682,7 +706,7 @@ public: | |||||||
| 	} | 	} | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|     //std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl; |     std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl; | ||||||
|   } |   } | ||||||
|   /// Introduce a block structure and switch off comms on boundaries |   /// Introduce a block structure and switch off comms on boundaries | ||||||
|   void DirichletBlock(const Coordinate &dirichlet_block) |   void DirichletBlock(const Coordinate &dirichlet_block) | ||||||
| @@ -737,8 +761,7 @@ public: | |||||||
| 		   int checkerboard, | 		   int checkerboard, | ||||||
| 		   const std::vector<int> &directions, | 		   const std::vector<int> &directions, | ||||||
| 		   const std::vector<int> &distances, | 		   const std::vector<int> &distances, | ||||||
| 		   Parameters p=Parameters(), | 		   Parameters p=Parameters()) | ||||||
| 		   bool preserve_shm=false) |  | ||||||
|   { |   { | ||||||
|     face_table_computed=0; |     face_table_computed=0; | ||||||
|     _grid    = grid; |     _grid    = grid; | ||||||
| @@ -832,9 +855,7 @@ public: | |||||||
|     ///////////////////////////////////////////////////////////////////////////////// |     ///////////////////////////////////////////////////////////////////////////////// | ||||||
|     const int Nsimd = grid->Nsimd(); |     const int Nsimd = grid->Nsimd(); | ||||||
|  |  | ||||||
|     // Allow for multiple stencils to exist simultaneously |     _grid->ShmBufferFreeAll(); | ||||||
|     if (!preserve_shm) |  | ||||||
|       _grid->ShmBufferFreeAll(); |  | ||||||
|  |  | ||||||
|     int maxl=2; |     int maxl=2; | ||||||
|     u_simd_send_buf.resize(maxl); |     u_simd_send_buf.resize(maxl); | ||||||
| @@ -1200,6 +1221,7 @@ public: | |||||||
| 	  /////////////////////////////////////////////////////////// | 	  /////////////////////////////////////////////////////////// | ||||||
| 	  int do_send = (comms_send|comms_partial_send) && (!shm_send ); | 	  int do_send = (comms_send|comms_partial_send) && (!shm_send ); | ||||||
| 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv ); | 	  int do_recv = (comms_send|comms_partial_send) && (!shm_recv ); | ||||||
|  | 	   | ||||||
| 	  AddPacket((void *)&send_buf[comm_off], | 	  AddPacket((void *)&send_buf[comm_off], | ||||||
| 		    (void *)&recv_buf[comm_off], | 		    (void *)&recv_buf[comm_off], | ||||||
| 		    xmit_to_rank, do_send, | 		    xmit_to_rank, do_send, | ||||||
|   | |||||||
| @@ -69,35 +69,6 @@ accelerator_inline auto trace(const iVector<vtype,N> &arg) -> iVector<decltype(t | |||||||
|   } |   } | ||||||
|   return ret; |   return ret; | ||||||
| } | } | ||||||
| //////////////////////////// |  | ||||||
| // Fast path traceProduct |  | ||||||
| //////////////////////////// |  | ||||||
| template<class S1 , class S2, IfNotGridTensor<S1> = 0, IfNotGridTensor<S2> = 0> |  | ||||||
| accelerator_inline auto traceProduct( const S1 &arg1,const S2 &arg2) |  | ||||||
|   -> decltype(arg1*arg2) |  | ||||||
| { |  | ||||||
|   return arg1*arg2; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class vtype,class rtype,int N > |  | ||||||
| accelerator_inline auto traceProduct(const iMatrix<vtype,N> &arg1,const iMatrix<rtype,N> &arg2) -> iScalar<decltype(trace(arg1._internal[0][0]*arg2._internal[0][0]))> |  | ||||||
| { |  | ||||||
|   iScalar<decltype( trace(arg1._internal[0][0]*arg2._internal[0][0] )) > ret; |  | ||||||
|   zeroit(ret._internal); |  | ||||||
|   for(int i=0;i<N;i++){ |  | ||||||
|   for(int j=0;j<N;j++){ |  | ||||||
|     ret._internal=ret._internal+traceProduct(arg1._internal[i][j],arg2._internal[j][i]); |  | ||||||
|   }} |  | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template<class vtype,class rtype > |  | ||||||
| accelerator_inline auto traceProduct(const iScalar<vtype> &arg1,const iScalar<rtype> &arg2) -> iScalar<decltype(trace(arg1._internal*arg2._internal))> |  | ||||||
| { |  | ||||||
|   iScalar<decltype(trace(arg1._internal*arg2._internal))> ret; |  | ||||||
|   ret._internal=traceProduct(arg1._internal,arg2._internal); |  | ||||||
|   return ret; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -34,12 +34,9 @@ NAMESPACE_BEGIN(Grid); | |||||||
|  |  | ||||||
|   // These are the Grid tensors |   // These are the Grid tensors | ||||||
|   template<typename T>     struct isGridTensor                : public std::false_type { static constexpr bool notvalue = true; }; |   template<typename T>     struct isGridTensor                : public std::false_type { static constexpr bool notvalue = true; }; | ||||||
|   template<class T>        struct isGridTensor<iScalar<T> >   : public std::true_type  { static constexpr bool notvalue = false; }; |   template<class T>        struct isGridTensor<iScalar<T>>    : public std::true_type  { static constexpr bool notvalue = false; }; | ||||||
|   template<class T, int N> struct isGridTensor<iVector<T, N> >: public std::true_type  { static constexpr bool notvalue = false; }; |   template<class T, int N> struct isGridTensor<iVector<T, N>> : public std::true_type  { static constexpr bool notvalue = false; }; | ||||||
|   template<class T, int N> struct isGridTensor<iMatrix<T, N> >: public std::true_type  { static constexpr bool notvalue = false; }; |   template<class T, int N> struct isGridTensor<iMatrix<T, N>> : public std::true_type  { static constexpr bool notvalue = false; }; | ||||||
|  |  | ||||||
|   template <typename T>  using IfGridTensor    = Invoke<std::enable_if<isGridTensor<T>::value, int> >; |  | ||||||
|   template <typename T>  using IfNotGridTensor = Invoke<std::enable_if<!isGridTensor<T>::value, int> >; |  | ||||||
|  |  | ||||||
|   // Traits to identify scalars |   // Traits to identify scalars | ||||||
|   template<typename T>     struct isGridScalar                : public std::false_type { static constexpr bool notvalue = true; }; |   template<typename T>     struct isGridScalar                : public std::false_type { static constexpr bool notvalue = true; }; | ||||||
| @@ -404,5 +401,3 @@ NAMESPACE_BEGIN(Grid); | |||||||
|   }; |   }; | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -7,8 +7,6 @@ uint32_t accelerator_threads=2; | |||||||
| uint32_t acceleratorThreads(void)       {return accelerator_threads;}; | uint32_t acceleratorThreads(void)       {return accelerator_threads;}; | ||||||
| void     acceleratorThreads(uint32_t t) {accelerator_threads = t;}; | void     acceleratorThreads(uint32_t t) {accelerator_threads = t;}; | ||||||
|  |  | ||||||
| #define ENV_LOCAL_RANK_PALS    "PALS_LOCAL_RANKID" |  | ||||||
| #define ENV_RANK_PALS          "PALS_RANKID" |  | ||||||
| #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK" | #define ENV_LOCAL_RANK_OMPI    "OMPI_COMM_WORLD_LOCAL_RANK" | ||||||
| #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK" | #define ENV_RANK_OMPI          "OMPI_COMM_WORLD_RANK" | ||||||
| #define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID" | #define ENV_LOCAL_RANK_SLURM   "SLURM_LOCALID" | ||||||
| @@ -149,7 +147,7 @@ void acceleratorInit(void) | |||||||
| #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); | #define GPU_PROP_FMT(canMapHostMemory,FMT)     printf("AcceleratorHipInit:   " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory); | ||||||
| #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d"); | #define GPU_PROP(canMapHostMemory)             GPU_PROP_FMT(canMapHostMemory,"%d"); | ||||||
|      |      | ||||||
|     auto r=hipGetDeviceProperties(&gpu_props[i], i); |     hipGetDeviceProperties(&gpu_props[i], i); | ||||||
|     hipDeviceProp_t prop;  |     hipDeviceProp_t prop;  | ||||||
|     prop = gpu_props[i]; |     prop = gpu_props[i]; | ||||||
|     totalDeviceMem = prop.totalGlobalMem; |     totalDeviceMem = prop.totalGlobalMem; | ||||||
| @@ -210,8 +208,8 @@ void acceleratorInit(void) | |||||||
|   cl::sycl::gpu_selector selector; |   cl::sycl::gpu_selector selector; | ||||||
|   cl::sycl::device selectedDevice { selector }; |   cl::sycl::device selectedDevice { selector }; | ||||||
|   theGridAccelerator = new sycl::queue (selectedDevice); |   theGridAccelerator = new sycl::queue (selectedDevice); | ||||||
|   theCopyAccelerator = new sycl::queue (selectedDevice); |   //  theCopyAccelerator = new sycl::queue (selectedDevice); | ||||||
|   //  theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. |   theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway. | ||||||
|  |  | ||||||
| #ifdef GRID_SYCL_LEVEL_ZERO_IPC | #ifdef GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|   zeInit(0); |   zeInit(0); | ||||||
| @@ -230,17 +228,8 @@ void acceleratorInit(void) | |||||||
|   { |   { | ||||||
|     rank = atoi(localRankStr);		 |     rank = atoi(localRankStr);		 | ||||||
|   } |   } | ||||||
|   if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL) |  | ||||||
|   { |  | ||||||
|     rank = atoi(localRankStr);		 |  | ||||||
|   } |  | ||||||
|   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);} |   if ((localRankStr = getenv(ENV_RANK_OMPI   )) != NULL) { world_rank = atoi(localRankStr);} | ||||||
|   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} |   if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} | ||||||
|   if ((localRankStr = getenv(ENV_RANK_PALS   )) != NULL) { world_rank = atoi(localRankStr);} |  | ||||||
|  |  | ||||||
|   char hostname[HOST_NAME_MAX+1]; |  | ||||||
|   gethostname(hostname, HOST_NAME_MAX+1); |  | ||||||
|   if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname); |  | ||||||
|  |  | ||||||
|   auto devices = cl::sycl::device::get_devices(); |   auto devices = cl::sycl::device::get_devices(); | ||||||
|   for(int d = 0;d<devices.size();d++){ |   for(int d = 0;d<devices.size();d++){ | ||||||
| @@ -252,10 +241,9 @@ void acceleratorInit(void) | |||||||
|     printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>()); |     printf("AcceleratorSyclInit:   " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>()); | ||||||
|  |  | ||||||
| #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld"); | #define GPU_PROP(prop)             GPU_PROP_FMT(prop,"%ld"); | ||||||
|     if ( world_rank == 0) { |  | ||||||
|  |  | ||||||
|       GPU_PROP_STR(vendor); |     GPU_PROP_STR(vendor); | ||||||
|       GPU_PROP_STR(version); |     GPU_PROP_STR(version); | ||||||
|     //    GPU_PROP_STR(device_type); |     //    GPU_PROP_STR(device_type); | ||||||
|     /* |     /* | ||||||
|     GPU_PROP(max_compute_units); |     GPU_PROP(max_compute_units); | ||||||
| @@ -271,8 +259,7 @@ void acceleratorInit(void) | |||||||
|     GPU_PROP(single_fp_config); |     GPU_PROP(single_fp_config); | ||||||
|     */ |     */ | ||||||
|     //    GPU_PROP(double_fp_config); |     //    GPU_PROP(double_fp_config); | ||||||
|       GPU_PROP(global_mem_size); |     GPU_PROP(global_mem_size); | ||||||
|     } |  | ||||||
|  |  | ||||||
|   } |   } | ||||||
|   if ( world_rank == 0 ) { |   if ( world_rank == 0 ) { | ||||||
|   | |||||||
| @@ -225,8 +225,6 @@ inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; | |||||||
| inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; | inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; | ||||||
| inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} | inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} | ||||||
| inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} | inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} | ||||||
| inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);} |  | ||||||
| inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);} |  | ||||||
| inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} | inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} | ||||||
| inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch | inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch | ||||||
| { | { | ||||||
| @@ -255,13 +253,17 @@ inline int  acceleratorIsCommunicable(void *ptr) | |||||||
| #define GRID_SYCL_LEVEL_ZERO_IPC | #define GRID_SYCL_LEVEL_ZERO_IPC | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  | #if 0 | ||||||
| // Force deterministic reductions | #include <CL/sycl.hpp> | ||||||
| #define SYCL_REDUCTION_DETERMINISTIC | #include <CL/sycl/usm.hpp> | ||||||
|  | #include <level_zero/ze_api.h> | ||||||
|  | #include <CL/sycl/backend/level_zero.hpp> | ||||||
|  | #else | ||||||
| #include <sycl/CL/sycl.hpp> | #include <sycl/CL/sycl.hpp> | ||||||
| #include <sycl/usm.hpp> | #include <sycl/usm.hpp> | ||||||
| #include <level_zero/ze_api.h> | #include <level_zero/ze_api.h> | ||||||
| #include <sycl/ext/oneapi/backend/level_zero.hpp> | #include <sycl/ext/oneapi/backend/level_zero.hpp> | ||||||
|  | #endif | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| @@ -285,24 +287,23 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { | |||||||
|  |  | ||||||
| #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ | #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\ | ||||||
|   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\ |   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {		\ | ||||||
|     unsigned long nt=acceleratorThreads();				\ |       unsigned long nt=acceleratorThreads();				\ | ||||||
|     if(nt < 8)nt=8;							\ |       unsigned long unum1 = num1;					\ | ||||||
|     unsigned long unum1 = num1;						\ |       unsigned long unum2 = num2;					\ | ||||||
|     unsigned long unum2 = num2;						\ |       if(nt < 8)nt=8;							\ | ||||||
|     unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt;	\ |       cl::sycl::range<3> local {nt,1,nsimd};				\ | ||||||
|     cl::sycl::range<3> local {nt,1,nsimd};				\ |       cl::sycl::range<3> global{unum1,unum2,nsimd};			\ | ||||||
|     cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};	\ |       cgh.parallel_for(					\ | ||||||
|     cgh.parallel_for(							\ |       cl::sycl::nd_range<3>(global,local), \ | ||||||
| 		     cl::sycl::nd_range<3>(global,local),		\ |       [=] (cl::sycl::nd_item<3> item) /*mutable*/     \ | ||||||
| 		     [=] (cl::sycl::nd_item<3> item) /*mutable*/	\ |       [[intel::reqd_sub_group_size(16)]]	      \ | ||||||
| 		     [[intel::reqd_sub_group_size(16)]]			\ |       {						      \ | ||||||
| 		     {							\ |       auto iter1    = item.get_global_id(0);	      \ | ||||||
| 		       auto iter1    = item.get_global_id(0);		\ |       auto iter2    = item.get_global_id(1);	      \ | ||||||
| 		       auto iter2    = item.get_global_id(1);		\ |       auto lane     = item.get_global_id(2);	      \ | ||||||
| 		       auto lane     = item.get_global_id(2);		\ |       { __VA_ARGS__ };				      \ | ||||||
| 		       { if (iter1 < unum1){ __VA_ARGS__ } };		\ |      });	   			              \ | ||||||
| 		     });						\ |     }); | ||||||
|   }); |  | ||||||
|  |  | ||||||
| #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } | #define accelerator_barrier(dummy) { theGridAccelerator->wait(); } | ||||||
|  |  | ||||||
| @@ -404,7 +405,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda) | |||||||
|  |  | ||||||
| #define accelerator_barrier(dummy)				\ | #define accelerator_barrier(dummy)				\ | ||||||
|   {								\ |   {								\ | ||||||
|     auto r=hipStreamSynchronize(computeStream);			\ |     hipStreamSynchronize(computeStream);			\ | ||||||
|     auto err = hipGetLastError();				\ |     auto err = hipGetLastError();				\ | ||||||
|     if ( err != hipSuccess ) {					\ |     if ( err != hipSuccess ) {					\ | ||||||
|       printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ |       printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \ | ||||||
| @@ -437,21 +438,19 @@ inline void *acceleratorAllocDevice(size_t bytes) | |||||||
|   return ptr; |   return ptr; | ||||||
| }; | }; | ||||||
|  |  | ||||||
| inline void acceleratorFreeShared(void *ptr){ auto r=hipFree(ptr);}; | inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; | ||||||
| inline void acceleratorFreeDevice(void *ptr){ auto r=hipFree(ptr);}; | inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; | ||||||
| inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { auto r=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} | inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} | ||||||
| inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto r=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} | inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} | ||||||
| inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);} |  | ||||||
| inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) { auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);} |  | ||||||
| //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} | //inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)  { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} | ||||||
| //inline void acceleratorCopySynchronise(void) {  } | //inline void acceleratorCopySynchronise(void) {  } | ||||||
| inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto r=hipMemset(base,value,bytes);} | inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} | ||||||
|  |  | ||||||
| inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch | inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch | ||||||
| { | { | ||||||
|   auto r=hipMemcpyDtoDAsync(to,from,bytes, copyStream); |   hipMemcpyDtoDAsync(to,from,bytes, copyStream); | ||||||
| } | } | ||||||
| inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyStream); }; | inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream); }; | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| @@ -576,11 +575,4 @@ accelerator_inline void acceleratorFence(void) | |||||||
|   return; |   return; | ||||||
| } | } | ||||||
|  |  | ||||||
| inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) |  | ||||||
| { |  | ||||||
|   acceleratorCopyDeviceToDeviceAsynch(from,to,bytes); |  | ||||||
|   acceleratorCopySynchronise(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|   | |||||||
| @@ -1,336 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
|     Source file: ./lib/Init.cc |  | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk> |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
| Author: Peter Boyle <peterboyle@MacBook-Pro.local> |  | ||||||
| Author: paboyle <paboyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); |  | ||||||
| /////////////////////////////////////////////////////// |  | ||||||
| // Grid Norm logging for repro testing |  | ||||||
| /////////////////////////////////////////////////////// |  | ||||||
| int FlightRecorder::PrintEntireLog; |  | ||||||
| int FlightRecorder::ContinueOnFail; |  | ||||||
| int FlightRecorder::LoggingMode; |  | ||||||
| int FlightRecorder::ChecksumComms; |  | ||||||
| int FlightRecorder::ChecksumCommsSend; |  | ||||||
| int32_t  FlightRecorder::XmitLoggingCounter; |  | ||||||
| int32_t  FlightRecorder::RecvLoggingCounter; |  | ||||||
| int32_t  FlightRecorder::CsumLoggingCounter; |  | ||||||
| int32_t  FlightRecorder::NormLoggingCounter; |  | ||||||
| int32_t  FlightRecorder::ReductionLoggingCounter; |  | ||||||
| uint64_t FlightRecorder::ErrorCounter; |  | ||||||
| std::vector<double> FlightRecorder::NormLogVector; |  | ||||||
| std::vector<double> FlightRecorder::ReductionLogVector; |  | ||||||
| std::vector<uint64_t> FlightRecorder::CsumLogVector; |  | ||||||
| std::vector<uint64_t> FlightRecorder::XmitLogVector; |  | ||||||
| std::vector<uint64_t> FlightRecorder::RecvLogVector; |  | ||||||
|  |  | ||||||
| void FlightRecorder::ResetCounters(void) |  | ||||||
| { |  | ||||||
|   XmitLoggingCounter=0; |  | ||||||
|   RecvLoggingCounter=0; |  | ||||||
|   CsumLoggingCounter=0; |  | ||||||
|   NormLoggingCounter=0; |  | ||||||
|   ReductionLoggingCounter=0; |  | ||||||
| } |  | ||||||
| void FlightRecorder::Truncate(void) |  | ||||||
| { |  | ||||||
|   ResetCounters(); |  | ||||||
|   XmitLogVector.resize(0); |  | ||||||
|   RecvLogVector.resize(0); |  | ||||||
|   NormLogVector.resize(0); |  | ||||||
|   CsumLogVector.resize(0); |  | ||||||
|   ReductionLogVector.resize(0); |  | ||||||
| } |  | ||||||
| void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode) |  | ||||||
| { |  | ||||||
|   switch ( mode ) { |  | ||||||
|   case LoggingModePrint: |  | ||||||
|     SetLoggingModePrint(); |  | ||||||
|     break; |  | ||||||
|   case LoggingModeRecord: |  | ||||||
|     SetLoggingModeRecord(); |  | ||||||
|     break; |  | ||||||
|   case LoggingModeVerify: |  | ||||||
|     SetLoggingModeVerify(); |  | ||||||
|     break; |  | ||||||
|   case LoggingModeNone: |  | ||||||
|     LoggingMode = mode; |  | ||||||
|     Truncate(); |  | ||||||
|     break; |  | ||||||
|   default: |  | ||||||
|     assert(0); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void FlightRecorder::SetLoggingModePrint(void) |  | ||||||
| { |  | ||||||
|   std::cout << " FlightRecorder: set to print output " <<std::endl; |  | ||||||
|   Truncate(); |  | ||||||
|   LoggingMode = LoggingModePrint; |  | ||||||
| } |  | ||||||
| void FlightRecorder::SetLoggingModeRecord(void) |  | ||||||
| { |  | ||||||
|   std::cout << " FlightRecorder: set to RECORD " <<std::endl; |  | ||||||
|   Truncate(); |  | ||||||
|   LoggingMode = LoggingModeRecord; |  | ||||||
| } |  | ||||||
| void FlightRecorder::SetLoggingModeVerify(void) |  | ||||||
| { |  | ||||||
|   std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl; |  | ||||||
|   ResetCounters(); |  | ||||||
|   LoggingMode = LoggingModeVerify; |  | ||||||
| } |  | ||||||
| uint64_t FlightRecorder::ErrorCount(void) |  | ||||||
| { |  | ||||||
|   return ErrorCounter; |  | ||||||
| } |  | ||||||
| void FlightRecorder::NormLog(double value) |  | ||||||
| { |  | ||||||
|   uint64_t hex = * ( (uint64_t *)&value ); |  | ||||||
|   if(LoggingMode == LoggingModePrint) { |  | ||||||
|     std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; |  | ||||||
|     NormLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(LoggingMode == LoggingModeRecord) { |  | ||||||
|     std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; |  | ||||||
|     NormLogVector.push_back(value); |  | ||||||
|     NormLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(LoggingMode == LoggingModeVerify) { |  | ||||||
|  |  | ||||||
|     if(NormLoggingCounter < NormLogVector.size()){ |  | ||||||
|       uint64_t hexref  = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] ); |  | ||||||
|  |  | ||||||
|       if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) { |  | ||||||
|  |  | ||||||
| 	std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter |  | ||||||
| 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" " |  | ||||||
| 		 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl; |  | ||||||
|  |  | ||||||
| 	std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl; |  | ||||||
|  |  | ||||||
| 	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n", |  | ||||||
| 		GridHostname(), |  | ||||||
| 		GlobalSharedMemory::WorldShmRank, |  | ||||||
| 		NormLoggingCounter,NormLogVector.size(), |  | ||||||
| 		value, NormLogVector[NormLoggingCounter]); fflush(stderr); |  | ||||||
|  |  | ||||||
| 	if(!ContinueOnFail)assert(0); // Force takedown of job |  | ||||||
| 	   |  | ||||||
| 	ErrorCounter++; |  | ||||||
|       } else { |  | ||||||
| 	if ( PrintEntireLog ) {  |  | ||||||
| 	  std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex |  | ||||||
| 		   <<" "<<hex<<" "<<hexref |  | ||||||
| 		   <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|         |  | ||||||
|     } |  | ||||||
|     if ( NormLogVector.size()==NormLoggingCounter ) { |  | ||||||
|       std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl; |  | ||||||
|     } |  | ||||||
|     NormLoggingCounter++; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void FlightRecorder::CsumLog(uint64_t hex) |  | ||||||
| { |  | ||||||
|   if(LoggingMode == LoggingModePrint) { |  | ||||||
|     std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; |  | ||||||
|     CsumLoggingCounter++; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if(LoggingMode == LoggingModeRecord) { |  | ||||||
|     std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; |  | ||||||
|     CsumLogVector.push_back(hex); |  | ||||||
|     CsumLoggingCounter++; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if(LoggingMode == LoggingModeVerify) { |  | ||||||
|      |  | ||||||
|     if(CsumLoggingCounter < CsumLogVector.size()) { |  | ||||||
|  |  | ||||||
|       uint64_t hexref  = CsumLogVector[CsumLoggingCounter] ; |  | ||||||
|  |  | ||||||
|       if ( hex != hexref ) { |  | ||||||
|  |  | ||||||
|         std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter |  | ||||||
| 		 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl; |  | ||||||
|  |  | ||||||
| 	fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n", |  | ||||||
| 		GridHostname(), |  | ||||||
| 		GlobalSharedMemory::WorldShmRank, |  | ||||||
| 		CsumLoggingCounter,hex, hexref); |  | ||||||
| 	fflush(stderr); |  | ||||||
|  |  | ||||||
| 	if(!ContinueOnFail) assert(0); // Force takedown of job |  | ||||||
| 	   |  | ||||||
| 	ErrorCounter++; |  | ||||||
|  |  | ||||||
|       } else { |  | ||||||
|  |  | ||||||
| 	if ( PrintEntireLog ) {  |  | ||||||
| 	  std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex |  | ||||||
| 		   <<" "<<hex<<" "<<hexref<<std::dec<<std::endl; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     }   |  | ||||||
|     if ( CsumLogVector.size()==CsumLoggingCounter ) { |  | ||||||
|       std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl; |  | ||||||
|     } |  | ||||||
|     CsumLoggingCounter++; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void FlightRecorder::ReductionLog(double local,double global) |  | ||||||
| { |  | ||||||
|   uint64_t hex_l = * ( (uint64_t *)&local ); |  | ||||||
|   uint64_t hex_g = * ( (uint64_t *)&global ); |  | ||||||
|   if(LoggingMode == LoggingModePrint) { |  | ||||||
|     std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl; |  | ||||||
|     ReductionLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(LoggingMode == LoggingModeRecord) { |  | ||||||
|     std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl; |  | ||||||
|     ReductionLogVector.push_back(global); |  | ||||||
|     ReductionLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(LoggingMode == LoggingModeVerify) { |  | ||||||
|     if(ReductionLoggingCounter < ReductionLogVector.size()){ |  | ||||||
|       if ( global != ReductionLogVector[ReductionLoggingCounter] ) { |  | ||||||
| 	fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n", |  | ||||||
| 		GridHostname(), |  | ||||||
| 		GlobalSharedMemory::WorldShmRank, |  | ||||||
| 		ReductionLoggingCounter,ReductionLogVector.size(), |  | ||||||
| 		global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr); |  | ||||||
| 	 |  | ||||||
| 	if ( !ContinueOnFail ) assert(0); |  | ||||||
|  |  | ||||||
| 	ErrorCounter++; |  | ||||||
|       } else { |  | ||||||
| 	if ( PrintEntireLog ) {  |  | ||||||
| 	  std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     if ( ReductionLogVector.size()==ReductionLoggingCounter ) { |  | ||||||
|       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl; |  | ||||||
|     } |  | ||||||
|     ReductionLoggingCounter++; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void FlightRecorder::xmitLog(void *buf,uint64_t bytes) |  | ||||||
| { |  | ||||||
|   if(LoggingMode == LoggingModeNone) return; |  | ||||||
|  |  | ||||||
|   if ( ChecksumCommsSend ){ |  | ||||||
|   uint64_t *ubuf = (uint64_t *)buf; |  | ||||||
|   if(LoggingMode == LoggingModeNone) return; |  | ||||||
|    |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); |  | ||||||
|   if(LoggingMode == LoggingModePrint) { |  | ||||||
|     std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; |  | ||||||
|     XmitLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(LoggingMode == LoggingModeRecord) { |  | ||||||
|     std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; |  | ||||||
|     XmitLogVector.push_back(_xor); |  | ||||||
|     XmitLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(LoggingMode == LoggingModeVerify) { |  | ||||||
|     if(XmitLoggingCounter < XmitLogVector.size()){ |  | ||||||
|       if ( _xor != XmitLogVector[XmitLoggingCounter] ) { |  | ||||||
| 	fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu  %lx expect glb %lx\n", |  | ||||||
| 		GridHostname(), |  | ||||||
| 		GlobalSharedMemory::WorldShmRank, |  | ||||||
| 		XmitLoggingCounter,XmitLogVector.size(), |  | ||||||
| 		_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr); |  | ||||||
| 	 |  | ||||||
| 	if ( !ContinueOnFail ) assert(0); |  | ||||||
|  |  | ||||||
| 	ErrorCounter++; |  | ||||||
|       } else { |  | ||||||
| 	if ( PrintEntireLog ) {  |  | ||||||
| 	  std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  XmitLogVector[XmitLoggingCounter] <<std::endl; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     if ( XmitLogVector.size()==XmitLoggingCounter ) { |  | ||||||
|       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl; |  | ||||||
|     } |  | ||||||
|     XmitLoggingCounter++; |  | ||||||
|   } |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank) |  | ||||||
| { |  | ||||||
|   if ( ChecksumComms ){ |  | ||||||
|   uint64_t *ubuf = (uint64_t *)buf; |  | ||||||
|   if(LoggingMode == LoggingModeNone) return; |  | ||||||
| #ifdef GRID_SYCL |  | ||||||
|   uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t)); |  | ||||||
|   if(LoggingMode == LoggingModePrint) { |  | ||||||
|     std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; |  | ||||||
|     RecvLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(LoggingMode == LoggingModeRecord) { |  | ||||||
|     std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl; |  | ||||||
|     RecvLogVector.push_back(_xor); |  | ||||||
|     RecvLoggingCounter++; |  | ||||||
|   } |  | ||||||
|   if(LoggingMode == LoggingModeVerify) { |  | ||||||
|     if(RecvLoggingCounter < RecvLogVector.size()){ |  | ||||||
|       if ( _xor != RecvLogVector[RecvLoggingCounter] ) { |  | ||||||
| 	fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu  %lx expect glb %lx from MPI rank %d\n", |  | ||||||
| 		GridHostname(), |  | ||||||
| 		GlobalSharedMemory::WorldShmRank, |  | ||||||
| 		RecvLoggingCounter,RecvLogVector.size(), |  | ||||||
| 		_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr); |  | ||||||
| 	 |  | ||||||
| 	if ( !ContinueOnFail ) assert(0); |  | ||||||
|  |  | ||||||
| 	ErrorCounter++; |  | ||||||
|       } else { |  | ||||||
| 	if ( PrintEntireLog ) {  |  | ||||||
| 	  std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<<  RecvLogVector[RecvLoggingCounter] <<std::endl; |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     if ( RecvLogVector.size()==RecvLoggingCounter ) { |  | ||||||
|       std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<RecvLoggingCounter<<" sends "<<std::endl; |  | ||||||
|     } |  | ||||||
|     RecvLoggingCounter++; |  | ||||||
|   } |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); |  | ||||||
| @@ -1,43 +0,0 @@ | |||||||
| #pragma once |  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); |  | ||||||
| class FlightRecorder { |  | ||||||
|  public: |  | ||||||
|   enum LoggingMode_t { |  | ||||||
|     LoggingModeNone, |  | ||||||
|     LoggingModePrint, |  | ||||||
|     LoggingModeRecord, |  | ||||||
|     LoggingModeVerify |  | ||||||
|   }; |  | ||||||
|    |  | ||||||
|   static int                   LoggingMode; |  | ||||||
|   static uint64_t              ErrorCounter; |  | ||||||
|   static int32_t               XmitLoggingCounter; |  | ||||||
|   static int32_t               RecvLoggingCounter; |  | ||||||
|   static int32_t               CsumLoggingCounter; |  | ||||||
|   static int32_t               NormLoggingCounter; |  | ||||||
|   static int32_t               ReductionLoggingCounter; |  | ||||||
|   static std::vector<uint64_t> XmitLogVector; |  | ||||||
|   static std::vector<uint64_t> RecvLogVector; |  | ||||||
|   static std::vector<uint64_t> CsumLogVector; |  | ||||||
|   static std::vector<double>   NormLogVector; |  | ||||||
|   static std::vector<double>   ReductionLogVector; |  | ||||||
|   static int ContinueOnFail; |  | ||||||
|   static int PrintEntireLog; |  | ||||||
|   static int ChecksumComms; |  | ||||||
|   static int ChecksumCommsSend; |  | ||||||
|   static void SetLoggingModePrint(void); |  | ||||||
|   static void SetLoggingModeRecord(void); |  | ||||||
|   static void SetLoggingModeVerify(void); |  | ||||||
|   static void SetLoggingMode(LoggingMode_t mode); |  | ||||||
|   static void NormLog(double value); |  | ||||||
|   static void CsumLog(uint64_t csum); |  | ||||||
|   static void ReductionLog(double lcl, double glbl); |  | ||||||
|   static void Truncate(void); |  | ||||||
|   static void ResetCounters(void); |  | ||||||
|   static uint64_t ErrorCount(void); |  | ||||||
|   static void xmitLog(void *,uint64_t bytes); |  | ||||||
|   static void recvLog(void *,uint64_t bytes,int rank); |  | ||||||
| }; |  | ||||||
| NAMESPACE_END(Grid); |  | ||||||
|  |  | ||||||
| @@ -77,10 +77,6 @@ feenableexcept (unsigned int excepts) | |||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| #ifndef HOST_NAME_MAX |  | ||||||
| #define HOST_NAME_MAX _POSIX_HOST_NAME_MAX |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| NAMESPACE_BEGIN(Grid); | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| @@ -94,12 +90,7 @@ int GridThread::_threads =1; | |||||||
| int GridThread::_hyperthreads=1; | int GridThread::_hyperthreads=1; | ||||||
| int GridThread::_cores=1; | int GridThread::_cores=1; | ||||||
|  |  | ||||||
| char hostname[HOST_NAME_MAX+1]; |  | ||||||
|  |  | ||||||
| char *GridHostname(void) |  | ||||||
| { |  | ||||||
|   return hostname; |  | ||||||
| } |  | ||||||
| const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;}; | const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;}; | ||||||
| const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;}; | const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;}; | ||||||
| const Coordinate GridDefaultSimd(int dims,int nsimd) | const Coordinate GridDefaultSimd(int dims,int nsimd) | ||||||
| @@ -402,8 +393,6 @@ void Grid_init(int *argc,char ***argv) | |||||||
|   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; |   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; | ||||||
|   std::cout << GridLogMessage << "================================================ "<<std::endl; |   std::cout << GridLogMessage << "================================================ "<<std::endl; | ||||||
|  |  | ||||||
|   gethostname(hostname, HOST_NAME_MAX+1); |  | ||||||
|   std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl; |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////// | ||||||
|   // Reporting |   // Reporting | ||||||
|   | |||||||
| @@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid); | |||||||
| void Grid_init(int *argc,char ***argv); | void Grid_init(int *argc,char ***argv); | ||||||
| void Grid_finalize(void); | void Grid_finalize(void); | ||||||
|  |  | ||||||
| char * GridHostname(void); |  | ||||||
|  |  | ||||||
| // internal, controled with --handle | // internal, controled with --handle | ||||||
| void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); | void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); | ||||||
| void Grid_debug_handler_init(void); | void Grid_debug_handler_init(void); | ||||||
| @@ -70,6 +68,5 @@ void GridParseLayout(char **argv,int argc, | |||||||
| void printHash(void); | void printHash(void); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| NAMESPACE_END(Grid); | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| #pragma once | #ifndef GRID_UTIL_H | ||||||
|  | #define GRID_UTIL_H | ||||||
| #include <Grid/util/Coordinate.h> | #include <Grid/util/Coordinate.h> | ||||||
| #include <Grid/util/Lexicographic.h> | #include <Grid/util/Lexicographic.h> | ||||||
| #include <Grid/util/Init.h> | #include <Grid/util/Init.h> | ||||||
| #include <Grid/util/FlightRecorder.h> | #endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -54,16 +54,15 @@ int main(int argc, char **argv) | |||||||
|   //  MD.name    = std::string("Force Gradient"); |   //  MD.name    = std::string("Force Gradient"); | ||||||
|   typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; |   typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; | ||||||
|   MD.name    = std::string("MinimumNorm2"); |   MD.name    = std::string("MinimumNorm2"); | ||||||
|   MD.MDsteps = 24; |   MD.MDsteps = 12; | ||||||
|   MD.trajL   = 1.0; |   MD.trajL   = 1.0; | ||||||
|  |  | ||||||
|   HMCparameters HMCparams; |   HMCparameters HMCparams; | ||||||
|   HMCparams.StartTrajectory  = 104; |   HMCparams.StartTrajectory  = 0; | ||||||
|   HMCparams.Trajectories     = 200; |   HMCparams.Trajectories     = 200; | ||||||
|   HMCparams.NoMetropolisUntil=  20; |   HMCparams.NoMetropolisUntil=  20; | ||||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; |   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; | ||||||
|   //  HMCparams.StartingType     =std::string("HotStart"); |   HMCparams.StartingType     =std::string("HotStart"); | ||||||
|   HMCparams.StartingType     =std::string("CheckpointStart"); |  | ||||||
|   HMCparams.MD = MD; |   HMCparams.MD = MD; | ||||||
|   HMCWrapper TheHMC(HMCparams); |   HMCWrapper TheHMC(HMCparams); | ||||||
|  |  | ||||||
| @@ -88,7 +87,6 @@ int main(int argc, char **argv) | |||||||
|   // here there is too much indirection |   // here there is too much indirection | ||||||
|   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; |   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; | ||||||
|   TheHMC.Resources.AddObservable<PlaqObs>(); |   TheHMC.Resources.AddObservable<PlaqObs>(); | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////// |   ////////////////////////////////////////////// | ||||||
|  |  | ||||||
|   const int Ls      = 16; |   const int Ls      = 16; | ||||||
| @@ -136,6 +134,7 @@ int main(int argc, char **argv) | |||||||
|   //////////////////////////////////// |   //////////////////////////////////// | ||||||
|   ActionLevel<HMCWrapper::Field> Level1(1); |   ActionLevel<HMCWrapper::Field> Level1(1); | ||||||
|   ActionLevel<HMCWrapper::Field> Level2(2); |   ActionLevel<HMCWrapper::Field> Level2(2); | ||||||
|  |   ActionLevel<HMCWrapper::Field> Level3(4); | ||||||
|  |  | ||||||
|   //////////////////////////////////// |   //////////////////////////////////// | ||||||
|   // Strange action |   // Strange action | ||||||
| @@ -192,7 +191,7 @@ int main(int argc, char **argv) | |||||||
|   Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho); |   Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho); | ||||||
|   SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout); |   SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout); | ||||||
|   JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy); |   JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy); | ||||||
|   if( ApplySmearing ) Level1.push_back(&Jacobian); |   if( ApplySmearing ) Level2.push_back(&Jacobian); | ||||||
|   std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; |   std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -201,7 +200,7 @@ int main(int argc, char **argv) | |||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   //  GaugeAction.is_smeared = ApplySmearing; |   //  GaugeAction.is_smeared = ApplySmearing; | ||||||
|   GaugeAction.is_smeared = true; |   GaugeAction.is_smeared = true; | ||||||
|   Level2.push_back(&GaugeAction); |   Level3.push_back(&GaugeAction); | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " ************************************************"<< std::endl; |   std::cout << GridLogMessage << " ************************************************"<< std::endl; | ||||||
|   std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; |   std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; | ||||||
| @@ -211,11 +210,10 @@ int main(int argc, char **argv) | |||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; |   std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; | ||||||
|  |  | ||||||
|   TheHMC.TheAction.push_back(Level1); |   TheHMC.TheAction.push_back(Level1); | ||||||
|   TheHMC.TheAction.push_back(Level2); |   TheHMC.TheAction.push_back(Level2); | ||||||
|  |   TheHMC.TheAction.push_back(Level3); | ||||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file |  | ||||||
|   TheHMC.initializeGaugeFieldAndRNGs(U); |  | ||||||
|  |  | ||||||
|   TheHMC.Run(SmearingPolicy); // for smearing |   TheHMC.Run(SmearingPolicy); // for smearing | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,226 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Copyright (C) 2023 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <pabobyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
| #include <Grid/qcd/smearing/GaugeConfigurationMasked.h> |  | ||||||
| #include <Grid/qcd/smearing/JacobianAction.h> |  | ||||||
|  |  | ||||||
| using namespace Grid; |  | ||||||
|  |  | ||||||
| int main(int argc, char **argv) |  | ||||||
| { |  | ||||||
|   std::cout << std::setprecision(12); |  | ||||||
|   /* Driver: sets up a MinimumNorm2 HMC with an EOFA strange action, a Hasenbusch chain of two-flavour ratio actions, the smearing Jacobian and an Iwasaki gauge action (all flagged is_smeared), then runs it. */ |  | ||||||
|   Grid_init(&argc, &argv); |  | ||||||
|   int threads = GridThread::GetThreads(); |  | ||||||
|   // here make a routine to print all the relevant information on the run |  | ||||||
|   std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; |  | ||||||
|  |  | ||||||
|    // Typedefs to simplify notation |  | ||||||
|   typedef WilsonImplR FermionImplPolicy; |  | ||||||
|   typedef MobiusFermionD FermionAction; |  | ||||||
|   typedef typename FermionAction::FermionField FermionField; |  | ||||||
|  |  | ||||||
|   typedef Grid::XmlReader       Serialiser; |  | ||||||
|  |  | ||||||
|   //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: |  | ||||||
|   IntegratorParameters MD; |  | ||||||
|   //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; |  | ||||||
|   //  MD.name    = std::string("Leap Frog"); |  | ||||||
|   //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; |  | ||||||
|   //  MD.name    = std::string("Force Gradient"); |  | ||||||
|   typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; |  | ||||||
|   MD.name    = std::string("MinimumNorm2"); |  | ||||||
|   MD.MDsteps = 24; |  | ||||||
|   MD.trajL   = 1.0; |  | ||||||
|  |  | ||||||
|   HMCparameters HMCparams; |  | ||||||
|   HMCparams.StartTrajectory  = 0; |  | ||||||
|   HMCparams.Trajectories     = 200; |  | ||||||
|   HMCparams.NoMetropolisUntil=  20; |  | ||||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; |  | ||||||
|   //  HMCparams.StartingType     =std::string("HotStart"); |  | ||||||
|   HMCparams.StartingType     =std::string("ColdStart"); |  | ||||||
|   //  HMCparams.StartingType     =std::string("CheckpointStart"); |  | ||||||
|   HMCparams.MD = MD; |  | ||||||
|   HMCWrapper TheHMC(HMCparams); |  | ||||||
|  |  | ||||||
|   // Grid from the command line arguments --grid and --mpi |  | ||||||
|   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition |  | ||||||
|  |  | ||||||
|   CheckpointerParameters CPparams; |  | ||||||
|   CPparams.config_prefix = "ckpoint_EODWF_lat"; |  | ||||||
|   CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr"; |  | ||||||
|   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; |  | ||||||
|   CPparams.saveInterval  = 1; |  | ||||||
|   CPparams.saveSmeared   = true; |  | ||||||
|   CPparams.format        = "IEEE64BIG"; |  | ||||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); |  | ||||||
|  |  | ||||||
|   RNGModuleParameters RNGpar; |  | ||||||
|   RNGpar.serial_seeds = "1 2 3 4 5"; |  | ||||||
|   RNGpar.parallel_seeds = "6 7 8 9 10"; |  | ||||||
|   TheHMC.Resources.SetRNGSeeds(RNGpar); |  | ||||||
|  |  | ||||||
|   // Construct observables |  | ||||||
|   // here there is too much indirection |  | ||||||
|   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; |  | ||||||
|   TheHMC.Resources.AddObservable<PlaqObs>(); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   const int Ls      = 12; |  | ||||||
|   Real beta         = 2.37; |  | ||||||
|   Real light_mass   = 0.0047; |  | ||||||
|   Real strange_mass = 0.0186; |  | ||||||
|   Real pv_mass      = 1.0; |  | ||||||
|   RealD M5  = 1.8; |  | ||||||
|   RealD b   = 1.0; // Scale factor one, Shamir |  | ||||||
|   RealD c   = 0.0; |  | ||||||
|  |  | ||||||
|   OneFlavourRationalParams OFRp; |  | ||||||
|   OFRp.lo       = 1.0e-2; |  | ||||||
|   OFRp.hi       = 64; |  | ||||||
|   OFRp.MaxIter  = 10000; |  | ||||||
|   OFRp.tolerance= 1.0e-10; |  | ||||||
|   OFRp.degree   = 14; |  | ||||||
|   OFRp.precision= 40; |  | ||||||
|  |  | ||||||
|   std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 }); |  | ||||||
|  |  | ||||||
|   auto GridPtr   = TheHMC.Resources.GetCartesian(); |  | ||||||
|   auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); |  | ||||||
|   auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); |  | ||||||
|   auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); |  | ||||||
|  |  | ||||||
|   IwasakiGaugeActionR GaugeAction(beta); |  | ||||||
|  |  | ||||||
|   // temporarily need a gauge field (Uhot is declared here but not referenced again in this function) |  | ||||||
|   LatticeGaugeField U(GridPtr); |  | ||||||
|   LatticeGaugeField Uhot(GridPtr); |  | ||||||
|  |  | ||||||
|   // These lines are unnecessary if BC are all periodic |  | ||||||
|   std::vector<Complex> boundary = {1,1,1,-1}; |  | ||||||
|   FermionAction::ImplParams Params(boundary); |  | ||||||
|  |  | ||||||
|   double StoppingCondition = 1e-10; |  | ||||||
|   double MaxCGIterations = 30000; |  | ||||||
|   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations); |  | ||||||
|  |  | ||||||
|   bool ApplySmearing = true; |  | ||||||
|    |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // Collect actions |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level1(1); |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level2(2); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // Strange action |  | ||||||
|   //////////////////////////////////// |  | ||||||
|  |  | ||||||
|   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); |  | ||||||
|   MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c); |  | ||||||
|   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  |  | ||||||
|     EOFA(Strange_Op_L, Strange_Op_R,  |  | ||||||
| 	 CG, |  | ||||||
| 	 CG, CG, |  | ||||||
| 	 CG, CG,  |  | ||||||
| 	 OFRp, false); |  | ||||||
|  |  | ||||||
|   EOFA.is_smeared = ApplySmearing; |  | ||||||
|   Level1.push_back(&EOFA); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // up down action |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   std::vector<Real> light_den; |  | ||||||
|   std::vector<Real> light_num; |  | ||||||
|   /* Mass chain: light_den = {light_mass, hasenbusch...}, light_num = {hasenbusch..., pv_mass}; quotient h pairs light_num[h] with light_den[h]. */ |  | ||||||
|   int n_hasenbusch = hasenbusch.size(); |  | ||||||
|   light_den.push_back(light_mass); |  | ||||||
|   for(int h=0;h<n_hasenbusch;h++){ |  | ||||||
|     light_den.push_back(hasenbusch[h]); |  | ||||||
|     light_num.push_back(hasenbusch[h]); |  | ||||||
|   } |  | ||||||
|   light_num.push_back(pv_mass); |  | ||||||
|  |  | ||||||
|   std::vector<FermionAction *> Numerators; |  | ||||||
|   std::vector<FermionAction *> Denominators; |  | ||||||
|   std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients; |  | ||||||
|   /* One numerator operator, one denominator operator and one ratio action per chain entry; they are allocated with new and never deleted in this function. */ |  | ||||||
|   for(int h=0;h<n_hasenbusch+1;h++){ |  | ||||||
|     std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl; |  | ||||||
|     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params)); |  | ||||||
|     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params)); |  | ||||||
|     Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG)); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   for(int h=0;h<n_hasenbusch+1;h++){ |  | ||||||
|     Quotients[h]->is_smeared = ApplySmearing; |  | ||||||
|     Level1.push_back(Quotients[h]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   // lnDetJacobianAction |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   double rho = 0.1;  // smearing parameter |  | ||||||
|   int Nsmear = 1;    // multiplier used only to set Nstep = 8*Nsmear below |  | ||||||
|   int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd |  | ||||||
|   Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho); |  | ||||||
|   SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout); |  | ||||||
|   JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy); |  | ||||||
|   if( ApplySmearing ) Level1.push_back(&Jacobian); |  | ||||||
|   std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   // Gauge action |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   GaugeAction.is_smeared = ApplySmearing; |  | ||||||
|   Level2.push_back(&GaugeAction); |  | ||||||
|   /* NOTE(review): the banner below prints "NO FERMIONS" although the EOFA and quotient actions were pushed onto Level1 above - the message looks stale; confirm. */ |  | ||||||
|   std::cout << GridLogMessage << " ************************************************"<< std::endl; |  | ||||||
|   std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; |  | ||||||
|   std::cout << GridLogMessage << " ************************************************"<< std::endl; |  | ||||||
|   std::cout << GridLogMessage <<  std::endl; |  | ||||||
|   std::cout << GridLogMessage <<  std::endl; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; |  | ||||||
|   TheHMC.TheAction.push_back(Level1); |  | ||||||
|   TheHMC.TheAction.push_back(Level2); |  | ||||||
|  |  | ||||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file |  | ||||||
|   TheHMC.initializeGaugeFieldAndRNGs(U); |  | ||||||
|  |  | ||||||
|   TheHMC.Run(SmearingPolicy); // for smearing |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |  | ||||||
| } // main |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1,226 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Copyright (C) 2023 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
| #include <Grid/qcd/smearing/GaugeConfigurationMasked.h> |  | ||||||
| #include <Grid/qcd/smearing/JacobianAction.h> |  | ||||||
|  |  | ||||||
| using namespace Grid; |  | ||||||
|  |  | ||||||
| int main(int argc, char **argv) |  | ||||||
| { |  | ||||||
|   std::cout << std::setprecision(12); |  | ||||||
|   /* Driver: sets up a MinimumNorm2 HMC with an EOFA strange action, a Hasenbusch chain of two-flavour ratio actions and an Iwasaki gauge action. ApplySmearing is false here, so every is_smeared flag is false and the Jacobian is not added to Level1; Run() is still given SmearingPolicy. */ |  | ||||||
|   Grid_init(&argc, &argv); |  | ||||||
|   int threads = GridThread::GetThreads(); |  | ||||||
|   // here make a routine to print all the relevant information on the run |  | ||||||
|   std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; |  | ||||||
|  |  | ||||||
|    // Typedefs to simplify notation |  | ||||||
|   typedef WilsonImplR FermionImplPolicy; |  | ||||||
|   typedef MobiusFermionD FermionAction; |  | ||||||
|   typedef typename FermionAction::FermionField FermionField; |  | ||||||
|  |  | ||||||
|   typedef Grid::XmlReader       Serialiser; |  | ||||||
|  |  | ||||||
|   //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: |  | ||||||
|   IntegratorParameters MD; |  | ||||||
|   //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; |  | ||||||
|   //  MD.name    = std::string("Leap Frog"); |  | ||||||
|   //  typedef GenericHMCRunner<ForceGradient> HMCWrapper; |  | ||||||
|   //  MD.name    = std::string("Force Gradient"); |  | ||||||
|   typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; |  | ||||||
|   MD.name    = std::string("MinimumNorm2"); |  | ||||||
|   MD.MDsteps = 24; |  | ||||||
|   MD.trajL   = 1.0; |  | ||||||
|  |  | ||||||
|   HMCparameters HMCparams; |  | ||||||
|   HMCparams.StartTrajectory  = 0; |  | ||||||
|   HMCparams.Trajectories     = 200; |  | ||||||
|   HMCparams.NoMetropolisUntil=  20; |  | ||||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; |  | ||||||
|   //  HMCparams.StartingType     =std::string("HotStart"); |  | ||||||
|   HMCparams.StartingType     =std::string("ColdStart"); |  | ||||||
|   //  HMCparams.StartingType     =std::string("CheckpointStart"); |  | ||||||
|   HMCparams.MD = MD; |  | ||||||
|   HMCWrapper TheHMC(HMCparams); |  | ||||||
|  |  | ||||||
|   // Grid from the command line arguments --grid and --mpi |  | ||||||
|   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition |  | ||||||
|  |  | ||||||
|   CheckpointerParameters CPparams; |  | ||||||
|   CPparams.config_prefix = "ckpoint_EODWF_lat"; |  | ||||||
|   CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr"; |  | ||||||
|   CPparams.rng_prefix    = "ckpoint_EODWF_rng"; |  | ||||||
|   CPparams.saveInterval  = 1; |  | ||||||
|   CPparams.saveSmeared   = true; |  | ||||||
|   CPparams.format        = "IEEE64BIG"; |  | ||||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); |  | ||||||
|  |  | ||||||
|   RNGModuleParameters RNGpar; |  | ||||||
|   RNGpar.serial_seeds = "1 2 3 4 5"; |  | ||||||
|   RNGpar.parallel_seeds = "6 7 8 9 10"; |  | ||||||
|   TheHMC.Resources.SetRNGSeeds(RNGpar); |  | ||||||
|  |  | ||||||
|   // Construct observables |  | ||||||
|   // here there is too much indirection |  | ||||||
|   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; |  | ||||||
|   TheHMC.Resources.AddObservable<PlaqObs>(); |  | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   const int Ls      = 12; |  | ||||||
|   Real beta         = 2.37; |  | ||||||
|   Real light_mass   = 0.0047; |  | ||||||
|   Real strange_mass = 0.0186; |  | ||||||
|   Real pv_mass      = 1.0; |  | ||||||
|   RealD M5  = 1.8; |  | ||||||
|   RealD b   = 1.0; // Scale factor one, Shamir |  | ||||||
|   RealD c   = 0.0; |  | ||||||
|  |  | ||||||
|   OneFlavourRationalParams OFRp; |  | ||||||
|   OFRp.lo       = 1.0e-2; |  | ||||||
|   OFRp.hi       = 64; |  | ||||||
|   OFRp.MaxIter  = 10000; |  | ||||||
|   OFRp.tolerance= 1.0e-10; |  | ||||||
|   OFRp.degree   = 14; |  | ||||||
|   OFRp.precision= 40; |  | ||||||
|  |  | ||||||
|   std::vector<Real> hasenbusch({ 0.05, 0.1, 0.25, 0.5 }); |  | ||||||
|  |  | ||||||
|   auto GridPtr   = TheHMC.Resources.GetCartesian(); |  | ||||||
|   auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); |  | ||||||
|   auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); |  | ||||||
|   auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); |  | ||||||
|  |  | ||||||
|   IwasakiGaugeActionR GaugeAction(beta); |  | ||||||
|  |  | ||||||
|   // temporarily need a gauge field (Uhot is declared here but not referenced again in this function) |  | ||||||
|   LatticeGaugeField U(GridPtr); |  | ||||||
|   LatticeGaugeField Uhot(GridPtr); |  | ||||||
|  |  | ||||||
|   // These lines are unnecessary if BC are all periodic |  | ||||||
|   std::vector<Complex> boundary = {1,1,1,-1}; |  | ||||||
|   FermionAction::ImplParams Params(boundary); |  | ||||||
|  |  | ||||||
|   double StoppingCondition = 1e-10; |  | ||||||
|   double MaxCGIterations = 30000; |  | ||||||
|   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations); |  | ||||||
|  |  | ||||||
|   bool ApplySmearing = false; |  | ||||||
|    |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // Collect actions |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level1(1); |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level2(2); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // Strange action |  | ||||||
|   //////////////////////////////////// |  | ||||||
|  |  | ||||||
|   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); |  | ||||||
|   MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c); |  | ||||||
|   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  |  | ||||||
|     EOFA(Strange_Op_L, Strange_Op_R,  |  | ||||||
| 	 CG, |  | ||||||
| 	 CG, CG, |  | ||||||
| 	 CG, CG,  |  | ||||||
| 	 OFRp, false); |  | ||||||
|  |  | ||||||
|   EOFA.is_smeared = ApplySmearing; |  | ||||||
|   Level1.push_back(&EOFA); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // up down action |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   std::vector<Real> light_den; |  | ||||||
|   std::vector<Real> light_num; |  | ||||||
|   /* Mass chain: light_den = {light_mass, hasenbusch...}, light_num = {hasenbusch..., pv_mass}; quotient h pairs light_num[h] with light_den[h]. */ |  | ||||||
|   int n_hasenbusch = hasenbusch.size(); |  | ||||||
|   light_den.push_back(light_mass); |  | ||||||
|   for(int h=0;h<n_hasenbusch;h++){ |  | ||||||
|     light_den.push_back(hasenbusch[h]); |  | ||||||
|     light_num.push_back(hasenbusch[h]); |  | ||||||
|   } |  | ||||||
|   light_num.push_back(pv_mass); |  | ||||||
|  |  | ||||||
|   std::vector<FermionAction *> Numerators; |  | ||||||
|   std::vector<FermionAction *> Denominators; |  | ||||||
|   std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients; |  | ||||||
|   /* One numerator operator, one denominator operator and one ratio action per chain entry; they are allocated with new and never deleted in this function. */ |  | ||||||
|   for(int h=0;h<n_hasenbusch+1;h++){ |  | ||||||
|     std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl; |  | ||||||
|     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params)); |  | ||||||
|     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params)); |  | ||||||
|     Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG)); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   for(int h=0;h<n_hasenbusch+1;h++){ |  | ||||||
|     Quotients[h]->is_smeared = ApplySmearing; |  | ||||||
|     Level1.push_back(Quotients[h]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   // lnDetJacobianAction |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   double rho = 0.1;  // smearing parameter |  | ||||||
|   int Nsmear = 1;    // multiplier used only to set Nstep = 8*Nsmear below |  | ||||||
|   int Nstep  = 8*Nsmear;    // number of smearing levels - must be multiple of 2Nd |  | ||||||
|   Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho); |  | ||||||
|   SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout); |  | ||||||
|   JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy); |  | ||||||
|   if( ApplySmearing ) Level1.push_back(&Jacobian); |  | ||||||
|   std::cout << GridLogMessage << " Built the Jacobian "<< std::endl; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   // Gauge action |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   GaugeAction.is_smeared = ApplySmearing; |  | ||||||
|   Level2.push_back(&GaugeAction); |  | ||||||
|   /* NOTE(review): the banner below prints "NO FERMIONS" although the EOFA and quotient actions were pushed onto Level1 above - the message looks stale; confirm. */ |  | ||||||
|   std::cout << GridLogMessage << " ************************************************"<< std::endl; |  | ||||||
|   std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl; |  | ||||||
|   std::cout << GridLogMessage << " ************************************************"<< std::endl; |  | ||||||
|   std::cout << GridLogMessage <<  std::endl; |  | ||||||
|   std::cout << GridLogMessage <<  std::endl; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the FT HMC "<< std::endl; |  | ||||||
|   TheHMC.TheAction.push_back(Level1); |  | ||||||
|   TheHMC.TheAction.push_back(Level2); |  | ||||||
|  |  | ||||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file |  | ||||||
|   TheHMC.initializeGaugeFieldAndRNGs(U); |  | ||||||
|  |  | ||||||
|   TheHMC.Run(SmearingPolicy); // for smearing |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |  | ||||||
| } // main |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1,350 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Source file: ./tests/Test_hmc_EODWFRatio.cc |  | ||||||
|  |  | ||||||
| Copyright (C) 2015-2016 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
| Author: Guido Cossu <guido.cossu@ed.ac.uk> |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
|  |  | ||||||
| int main(int argc, char **argv) { |  | ||||||
|   using namespace Grid; |  | ||||||
|  |  | ||||||
|   Grid_init(&argc, &argv); |  | ||||||
|  |  | ||||||
|   CartesianCommunicator::BarrierWorld(); |  | ||||||
|   std::cout << GridLogMessage << " Clock skew check" <<std::endl; |  | ||||||
|    |  | ||||||
|   int threads = GridThread::GetThreads(); |  | ||||||
|  |  | ||||||
|    // Typedefs to simplify notation |  | ||||||
|   typedef WilsonImplD FermionImplPolicy; |  | ||||||
|   typedef MobiusFermionD FermionAction; |  | ||||||
|   typedef MobiusEOFAFermionD FermionEOFAAction; |  | ||||||
|   typedef typename FermionAction::FermionField FermionField; |  | ||||||
|  |  | ||||||
|   typedef Grid::XmlReader       Serialiser; |  | ||||||
|  |  | ||||||
|   //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: |  | ||||||
|   IntegratorParameters MD; |  | ||||||
|   //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; |  | ||||||
|   //  MD.name    = std::string("Leap Frog"); |  | ||||||
|   typedef GenericHMCRunner<ForceGradient> HMCWrapper; |  | ||||||
|   MD.name    = std::string("Force Gradient"); |  | ||||||
|   //typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; |  | ||||||
|   // MD.name    = std::string("MinimumNorm2"); |  | ||||||
|   // TrajL = 2 |  | ||||||
|   // 4/2 => 0.6 dH |  | ||||||
|   // 3/3 => 0.8 dH .. depth 3, slower |  | ||||||
|   //MD.MDsteps =  4; |  | ||||||
|   MD.MDsteps =  3; |  | ||||||
|   MD.trajL   = 0.5; |  | ||||||
|  |  | ||||||
|   HMCparameters HMCparams; |  | ||||||
|   HMCparams.StartTrajectory  = 1077; |  | ||||||
|   HMCparams.Trajectories     = 1; |  | ||||||
|   HMCparams.NoMetropolisUntil=  0; |  | ||||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; |  | ||||||
|   //  HMCparams.StartingType     =std::string("ColdStart"); |  | ||||||
|   HMCparams.StartingType     =std::string("CheckpointStart"); |  | ||||||
|   HMCparams.MD = MD; |  | ||||||
|   HMCWrapper TheHMC(HMCparams); |  | ||||||
|  |  | ||||||
|   // Grid from the command line arguments --grid and --mpi |  | ||||||
|   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition |  | ||||||
|  |  | ||||||
|   CheckpointerParameters CPparams; |  | ||||||
|   CPparams.config_prefix = "ckpoint_DDHMC_lat"; |  | ||||||
|   CPparams.rng_prefix    = "ckpoint_DDHMC_rng"; |  | ||||||
|   CPparams.saveInterval  = 1; |  | ||||||
|   CPparams.format        = "IEEE64BIG"; |  | ||||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); |  | ||||||
|   std::cout << "loaded NERSC checpointer"<<std::endl; |  | ||||||
|   RNGModuleParameters RNGpar; |  | ||||||
|   RNGpar.serial_seeds = "1 2 3 4 5"; |  | ||||||
|   RNGpar.parallel_seeds = "6 7 8 9 10"; |  | ||||||
|   TheHMC.Resources.SetRNGSeeds(RNGpar); |  | ||||||
|  |  | ||||||
|   // Construct observables |  | ||||||
|   // here there is too much indirection |  | ||||||
|   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; |  | ||||||
|   TheHMC.Resources.AddObservable<PlaqObs>(); |  | ||||||
|   ////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   const int Ls      = 12; |  | ||||||
|   RealD M5  = 1.8; |  | ||||||
|   RealD b   = 1.5; |  | ||||||
|   RealD c   = 0.5; |  | ||||||
|   Real beta         = 2.13; |  | ||||||
|   //  Real light_mass   = 5.4e-4; |  | ||||||
|   Real light_mass     = 7.8e-4; |  | ||||||
|   Real light_mass_dir = 0.01; |  | ||||||
|   Real strange_mass = 0.0362; |  | ||||||
|   Real pv_mass      = 1.0; |  | ||||||
|   std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); |  | ||||||
|   //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); |  | ||||||
|   //  std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated |  | ||||||
|   //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); |  | ||||||
|  |  | ||||||
|   int SP_iters=9000; |  | ||||||
|    |  | ||||||
|   RationalActionParams OFRp; // Up/down |  | ||||||
|   OFRp.lo       = 6.0e-5; |  | ||||||
|   OFRp.hi       = 90.0; |  | ||||||
|   OFRp.inv_pow  = 2; |  | ||||||
|   OFRp.MaxIter  = SP_iters; // get most shifts by 2000, stop sharing space |  | ||||||
|   OFRp.action_tolerance= 1.0e-8; |  | ||||||
|   OFRp.action_degree   = 18; |  | ||||||
|   OFRp.md_tolerance= 1.0e-7; |  | ||||||
|   OFRp.md_degree   = 14; |  | ||||||
|   //  OFRp.degree   = 20; converges |  | ||||||
|   //  OFRp.degree   = 16; |  | ||||||
|   OFRp.precision= 80; |  | ||||||
|   OFRp.BoundsCheckFreq=0; |  | ||||||
|   std::vector<RealD> ActionTolByPole({ |  | ||||||
|       //      1.0e-8,1.0e-8,1.0e-8,1.0e-8, |  | ||||||
|       3.0e-7,1.0e-7,1.0e-8,1.0e-8, |  | ||||||
|       1.0e-8,1.0e-8,1.0e-8,1.0e-8, |  | ||||||
|       1.0e-8,1.0e-8,1.0e-8,1.0e-8, |  | ||||||
|       1.0e-8,1.0e-8,1.0e-8,1.0e-8, |  | ||||||
|       1.0e-8,1.0e-8 |  | ||||||
|     }); |  | ||||||
|   std::vector<RealD> MDTolByPole({ |  | ||||||
|       //      1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more |  | ||||||
|       //      1.0e-6,3.0e-7,1.0e-7,1.0e-7, |  | ||||||
|       1.0e-5,1.0e-6,1.0e-7,1.0e-7, // soften convergence |  | ||||||
|       1.0e-8,1.0e-8,1.0e-8,1.0e-8, |  | ||||||
|       1.0e-8,1.0e-8,1.0e-8,1.0e-8, |  | ||||||
|       1.0e-8,1.0e-8 |  | ||||||
|     }); |  | ||||||
|  |  | ||||||
|   auto GridPtr   = TheHMC.Resources.GetCartesian(); |  | ||||||
|   auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); |  | ||||||
|  |  | ||||||
|   typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD; |  | ||||||
|   typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD; |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   // Domain decomposed |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   Coordinate latt4  = GridPtr->GlobalDimensions(); |  | ||||||
|   Coordinate mpi    = GridPtr->ProcessorGrid(); |  | ||||||
|   Coordinate shm; |  | ||||||
|  |  | ||||||
|   GlobalSharedMemory::GetShmDims(mpi,shm); |  | ||||||
|    |  | ||||||
|   Coordinate CommDim(Nd); |  | ||||||
|   for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0; |  | ||||||
|  |  | ||||||
|   Coordinate NonDirichlet(Nd+1,0); |  | ||||||
|   Coordinate Dirichlet(Nd+1,0); |  | ||||||
|   Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; |  | ||||||
|   Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; |  | ||||||
|   Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; |  | ||||||
|   Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; |  | ||||||
|   //Dirichlet[1] = 0; |  | ||||||
|   //Dirichlet[2] = 0; |  | ||||||
|   //Dirichlet[3] = 0; |  | ||||||
|  |  | ||||||
|   //  |  | ||||||
|   Coordinate Block4(Nd); |  | ||||||
|   Block4[0] = Dirichlet[1]; |  | ||||||
|   Block4[1] = Dirichlet[2]; |  | ||||||
|   Block4[2] = Dirichlet[3]; |  | ||||||
|   Block4[3] = Dirichlet[4]; |  | ||||||
|  |  | ||||||
|   int Width=4; |  | ||||||
|   TheHMC.Resources.SetMomentumFilter(new DDHMCFilter<WilsonImplD::Field>(Block4,Width)); |  | ||||||
|  |  | ||||||
|   ////////////////////////// |  | ||||||
|   // Fermion Grids |  | ||||||
|   ////////////////////////// |  | ||||||
|   auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); |  | ||||||
|   auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); |  | ||||||
|  |  | ||||||
|   IwasakiGaugeActionR GaugeAction(beta); |  | ||||||
|  |  | ||||||
|   // temporarily need a gauge field |  | ||||||
|   LatticeGaugeFieldD  U(GridPtr); U=Zero(); |  | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; |  | ||||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file |  | ||||||
|   TheHMC.initializeGaugeFieldAndRNGs(U); |  | ||||||
|   std::cout << "loaded NERSC gauge field"<<std::endl; |  | ||||||
|  |  | ||||||
|   // These lines are unnecessary if BC are all periodic |  | ||||||
|   std::vector<Complex> boundary = {1,1,1,-1}; |  | ||||||
|   FermionAction::ImplParams Params(boundary); |  | ||||||
|   FermionAction::ImplParams ParamsDir(boundary); |  | ||||||
|  |  | ||||||
|   Params.dirichlet=NonDirichlet; |  | ||||||
|   ParamsDir.dirichlet=Dirichlet; |  | ||||||
|   ParamsDir.partialDirichlet=0; |  | ||||||
|   std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl; |  | ||||||
|  |  | ||||||
|   //  double StoppingCondition = 1e-14; |  | ||||||
|   //  double MDStoppingCondition = 1e-9; |  | ||||||
|   double StoppingCondition = 1e-8; |  | ||||||
|   double MDStoppingCondition = 1e-8; |  | ||||||
|   double MDStoppingConditionLoose = 1e-8; |  | ||||||
|   double MDStoppingConditionStrange = 1e-8; |  | ||||||
|   double MaxCGIterations = 300000; |  | ||||||
|   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations); |  | ||||||
|   ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // Collect actions |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level1(1); |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level2(3); |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level3(15); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // Strange action |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); |  | ||||||
|   FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params); |  | ||||||
|  |  | ||||||
|   // Probably dominates the force - back to EOFA. |  | ||||||
|   OneFlavourRationalParams SFRp; |  | ||||||
|   SFRp.lo       = 0.1; |  | ||||||
|   SFRp.hi       = 25.0; |  | ||||||
|   SFRp.MaxIter  = 10000; |  | ||||||
|   SFRp.tolerance= 1.0e-8; |  | ||||||
|   SFRp.mdtolerance= 2.0e-6; |  | ||||||
|   SFRp.degree   = 12; |  | ||||||
|   SFRp.precision= 50; |  | ||||||
|    |  | ||||||
|   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); |  | ||||||
|   MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c); |  | ||||||
|   ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations); |  | ||||||
|   ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations); |  | ||||||
|   LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); |  | ||||||
|   LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); |  | ||||||
|  |  | ||||||
|   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  |  | ||||||
|     EOFA(Strange_Op_L, Strange_Op_R,  |  | ||||||
| 	 ActionCG,  |  | ||||||
| 	 ActionCG, ActionCG, |  | ||||||
| 	 DerivativeCG, DerivativeCG, |  | ||||||
| 	 SFRp, true); |  | ||||||
|   Level2.push_back(&EOFA); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // up down action |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   std::vector<Real> light_den; |  | ||||||
|   std::vector<Real> light_num; |  | ||||||
|   std::vector<int> dirichlet_den; |  | ||||||
|   std::vector<int> dirichlet_num; |  | ||||||
|  |  | ||||||
|   int n_hasenbusch = hasenbusch.size(); |  | ||||||
|   light_den.push_back(light_mass);  dirichlet_den.push_back(0); |  | ||||||
|   for(int h=0;h<n_hasenbusch;h++){ |  | ||||||
|     light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(1); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   for(int h=0;h<n_hasenbusch;h++){ |  | ||||||
|     light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(1); |  | ||||||
|   } |  | ||||||
|   light_num.push_back(pv_mass);  dirichlet_num.push_back(0); |  | ||||||
|  |  | ||||||
|   std::vector<FermionAction *> Numerators; |  | ||||||
|   std::vector<FermionAction *> Denominators; |  | ||||||
|   std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients; |  | ||||||
|    |  | ||||||
|   std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys; |  | ||||||
|  |  | ||||||
|   typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD; |  | ||||||
|   std::vector<LinearOperatorD *> LinOpD; |  | ||||||
|    |  | ||||||
|   for(int h=0;h<n_hasenbusch+1;h++){ |  | ||||||
|     std::cout << GridLogMessage |  | ||||||
| 	      << " 2f quotient Action "; |  | ||||||
|     std::cout << "det D("<<light_den[h]<<")"; |  | ||||||
|     if ( dirichlet_den[h] ) std::cout << "^dirichlet    "; |  | ||||||
|     std::cout << "/ det D("<<light_num[h]<<")"; |  | ||||||
|     if ( dirichlet_num[h] ) std::cout << "^dirichlet    "; |  | ||||||
|     std::cout << std::endl; |  | ||||||
|  |  | ||||||
|     FermionAction::ImplParams ParamsNum(boundary); |  | ||||||
|     FermionAction::ImplParams ParamsDen(boundary); |  | ||||||
|      |  | ||||||
|     if ( dirichlet_num[h]==1) ParamsNum.dirichlet = Dirichlet; |  | ||||||
|     else                      ParamsNum.dirichlet = NonDirichlet; |  | ||||||
|  |  | ||||||
|     if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; |  | ||||||
|     else                      ParamsDen.dirichlet = NonDirichlet; |  | ||||||
|  |  | ||||||
|     if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; |  | ||||||
|     else                      ParamsNum.partialDirichlet = 0; |  | ||||||
|  |  | ||||||
|     if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; |  | ||||||
|     else                      ParamsDen.partialDirichlet = 0; |  | ||||||
|      |  | ||||||
|     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); |  | ||||||
|     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); |  | ||||||
|  |  | ||||||
|     LinOpD.push_back(new LinearOperatorD(*Denominators[h])); |  | ||||||
|  |  | ||||||
|     double conv  = MDStoppingCondition; |  | ||||||
|     if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors |  | ||||||
|      |  | ||||||
|     if(h!=0) { |  | ||||||
|       Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG)); |  | ||||||
|     } else { |  | ||||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp)); |  | ||||||
|       Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp)); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   for(int h=0;h<Bdys.size();h++){ |  | ||||||
|     Bdys[h]->SetTolerances(ActionTolByPole,MDTolByPole); |  | ||||||
|   } |  | ||||||
|   int nquo=Quotients.size(); |  | ||||||
|   Level1.push_back(Bdys[0]); |  | ||||||
|   Level1.push_back(Bdys[1]); |  | ||||||
|   Level2.push_back(Quotients[0]); |  | ||||||
|   for(int h=1;h<nquo-1;h++){ |  | ||||||
|     Level2.push_back(Quotients[h]); |  | ||||||
|   } |  | ||||||
|   Level2.push_back(Quotients[nquo-1]); |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   // Gauge action |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   Level3.push_back(&GaugeAction); |  | ||||||
|   TheHMC.TheAction.push_back(Level1); |  | ||||||
|   TheHMC.TheAction.push_back(Level2); |  | ||||||
|   TheHMC.TheAction.push_back(Level3); |  | ||||||
|   std::cout << GridLogMessage << " Action complete "<< std::endl; |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   TheHMC.Run();  // no smearing |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |  | ||||||
| } // main |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -343,7 +343,7 @@ int main(int argc, char **argv) { | |||||||
|   // Probably dominates the force - back to EOFA. |   // Probably dominates the force - back to EOFA. | ||||||
|   OneFlavourRationalParams SFRp; |   OneFlavourRationalParams SFRp; | ||||||
|   SFRp.lo       = 0.1; |   SFRp.lo       = 0.1; | ||||||
|   SFRp.hi       = 30.0; |   SFRp.hi       = 25.0; | ||||||
|   SFRp.MaxIter  = 10000; |   SFRp.MaxIter  = 10000; | ||||||
|   SFRp.tolerance= 1.0e-5; |   SFRp.tolerance= 1.0e-5; | ||||||
|   SFRp.mdtolerance= 2.0e-4; |   SFRp.mdtolerance= 2.0e-4; | ||||||
|   | |||||||
| @@ -128,7 +128,7 @@ template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, c | |||||||
|       //////////////////////////////////////////////////////////////////////////////////// |       //////////////////////////////////////////////////////////////////////////////////// | ||||||
|       // Make a mixed precision conjugate gradient |       // Make a mixed precision conjugate gradient | ||||||
|       //////////////////////////////////////////////////////////////////////////////////// |       //////////////////////////////////////////////////////////////////////////////////// | ||||||
| #if 0 | #if 1 | ||||||
|       RealD delta=1.e-4; |       RealD delta=1.e-4; | ||||||
|       std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl; |       std::cout << GridLogMessage << "Calling reliable update Conjugate Gradient" <<std::endl; | ||||||
|       ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); |       ConjugateGradientReliableUpdate<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations*MaxOuterIterations,delta,SinglePrecGrid5,LinOpF,LinOpD); | ||||||
| @@ -180,7 +180,7 @@ int main(int argc, char **argv) { | |||||||
|   // 4/2 => 0.6 dH |   // 4/2 => 0.6 dH | ||||||
|   // 3/3 => 0.8 dH .. depth 3, slower |   // 3/3 => 0.8 dH .. depth 3, slower | ||||||
|   //MD.MDsteps =  4; |   //MD.MDsteps =  4; | ||||||
|   MD.MDsteps =  12; |   MD.MDsteps =  14; | ||||||
|   MD.trajL   = 0.5; |   MD.trajL   = 0.5; | ||||||
|  |  | ||||||
|   HMCparameters HMCparams; |   HMCparameters HMCparams; | ||||||
| @@ -204,7 +204,7 @@ int main(int argc, char **argv) { | |||||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); |   TheHMC.Resources.LoadNerscCheckpointer(CPparams); | ||||||
|   std::cout << "loaded NERSC checpointer"<<std::endl; |   std::cout << "loaded NERSC checpointer"<<std::endl; | ||||||
|   RNGModuleParameters RNGpar; |   RNGModuleParameters RNGpar; | ||||||
|   RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10"; |   RNGpar.serial_seeds = "1 2 3 4 5"; | ||||||
|   RNGpar.parallel_seeds = "6 7 8 9 10"; |   RNGpar.parallel_seeds = "6 7 8 9 10"; | ||||||
|   TheHMC.Resources.SetRNGSeeds(RNGpar); |   TheHMC.Resources.SetRNGSeeds(RNGpar); | ||||||
|  |  | ||||||
| @@ -218,14 +218,15 @@ int main(int argc, char **argv) { | |||||||
|   RealD M5  = 1.8; |   RealD M5  = 1.8; | ||||||
|   RealD b   = 1.5; |   RealD b   = 1.5; | ||||||
|   RealD c   = 0.5; |   RealD c   = 0.5; | ||||||
|   RealD beta         = 2.13; |   Real beta         = 2.13; | ||||||
|   //  Real light_mass   = 5.4e-4; |   //  Real light_mass   = 5.4e-4; | ||||||
|   Real light_mass     = 7.8e-4; |   Real light_mass     = 7.8e-4; | ||||||
|   //  Real light_mass     = 7.8e-3; |  | ||||||
|   Real strange_mass = 0.0362; |   Real strange_mass = 0.0362; | ||||||
|   Real pv_mass      = 1.0; |   Real pv_mass      = 1.0; | ||||||
|   std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated |   //  std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); | ||||||
|   //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated |   //  std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass }); | ||||||
|  |   std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated | ||||||
|  |   //  std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass }); | ||||||
|  |  | ||||||
|   auto GridPtr   = TheHMC.Resources.GetCartesian(); |   auto GridPtr   = TheHMC.Resources.GetCartesian(); | ||||||
|   auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); |   auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); | ||||||
| @@ -276,20 +277,20 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|   //  double StoppingCondition = 1e-14; |   //  double StoppingCondition = 1e-14; | ||||||
|   //  double MDStoppingCondition = 1e-9; |   //  double MDStoppingCondition = 1e-9; | ||||||
|   double StoppingCondition = 1e-14; |   double StoppingCondition = 1e-9; | ||||||
|   double MDStoppingCondition = 1e-9; |   double MDStoppingCondition = 1e-8; | ||||||
|   double MDStoppingConditionLoose = 1e-9; |   double MDStoppingConditionLoose = 1e-8; | ||||||
|   double MDStoppingConditionStrange = 1e-9; |   double MDStoppingConditionStrange = 1e-8; | ||||||
|   double MaxCGIterations = 50000; |   double MaxCGIterations = 300000; | ||||||
|   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations); |   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations); | ||||||
|   ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations); |   ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations); | ||||||
|  |  | ||||||
|   //////////////////////////////////// |   //////////////////////////////////// | ||||||
|   // Collect actions |   // Collect actions | ||||||
|   //////////////////////////////////// |   //////////////////////////////////// | ||||||
|   ActionLevel<HMCWrapper::Field> Level1(1); |   //  ActionLevel<HMCWrapper::Field> Level1(1); | ||||||
|   ActionLevel<HMCWrapper::Field> Level2(2); |   ActionLevel<HMCWrapper::Field> Level2(1); | ||||||
|   ActionLevel<HMCWrapper::Field> Level3(4); |   ActionLevel<HMCWrapper::Field> Level3(15); | ||||||
|  |  | ||||||
|   //////////////////////////////////// |   //////////////////////////////////// | ||||||
|   // Strange action |   // Strange action | ||||||
| @@ -299,11 +300,11 @@ int main(int argc, char **argv) { | |||||||
|  |  | ||||||
|   // Probably dominates the force - back to EOFA. |   // Probably dominates the force - back to EOFA. | ||||||
|   OneFlavourRationalParams SFRp; |   OneFlavourRationalParams SFRp; | ||||||
|   SFRp.lo       = 0.8; |   SFRp.lo       = 0.1; | ||||||
|   SFRp.hi       = 30.0; |   SFRp.hi       = 30.0; | ||||||
|   SFRp.MaxIter  = 10000; |   SFRp.MaxIter  = 10000; | ||||||
|   SFRp.tolerance= 1.0e-12; |   SFRp.tolerance= 1.0e-8; | ||||||
|   SFRp.mdtolerance= 1.0e-9; |   SFRp.mdtolerance= 2.0e-6; | ||||||
|   SFRp.degree   = 10; |   SFRp.degree   = 10; | ||||||
|   SFRp.precision= 50; |   SFRp.precision= 50; | ||||||
|    |    | ||||||
| @@ -354,10 +355,8 @@ int main(int argc, char **argv) { | |||||||
|   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  |   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  | ||||||
|     EOFA(Strange_Op_L, Strange_Op_R,  |     EOFA(Strange_Op_L, Strange_Op_R,  | ||||||
| 	 ActionCG,  | 	 ActionCG,  | ||||||
| 	 //	 ActionCGL, ActionCGR, | 	 ActionCGL, ActionCGR, | ||||||
| 	 //	 DerivativeCGL, DerivativeCGR, | 	 DerivativeCGL, DerivativeCGR, | ||||||
| 	 ActionCG, ActionCG, |  | ||||||
| 	 DerivativeCG, DerivativeCG, |  | ||||||
| 	 SFRp, true); | 	 SFRp, true); | ||||||
|   Level2.push_back(&EOFA); |   Level2.push_back(&EOFA); | ||||||
|  |  | ||||||
| @@ -444,14 +443,13 @@ int main(int argc, char **argv) { | |||||||
|   } |   } | ||||||
|   int nquo=Quotients.size(); |   int nquo=Quotients.size(); | ||||||
|   for(int h=0;h<nquo;h++){ |   for(int h=0;h<nquo;h++){ | ||||||
|     Level1.push_back(Quotients[h]); |     Level2.push_back(Quotients[h]); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   // Gauge action |   // Gauge action | ||||||
|   ///////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////// | ||||||
|   Level3.push_back(&GaugeAction); |   Level3.push_back(&GaugeAction); | ||||||
|   TheHMC.TheAction.push_back(Level1); |  | ||||||
|   TheHMC.TheAction.push_back(Level2); |   TheHMC.TheAction.push_back(Level2); | ||||||
|   TheHMC.TheAction.push_back(Level3); |   TheHMC.TheAction.push_back(Level3); | ||||||
|   std::cout << GridLogMessage << " Action complete "<< std::endl; |   std::cout << GridLogMessage << " Action complete "<< std::endl; | ||||||
|   | |||||||
| @@ -1,268 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid |  | ||||||
|  |  | ||||||
| Source file: ./tests/Test_hmc_EODWFRatio.cc |  | ||||||
|  |  | ||||||
| Copyright (C) 2015-2016 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
| Author: Guido Cossu <guido.cossu@ed.ac.uk> |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution |  | ||||||
| directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| int main(int argc, char **argv) { |  | ||||||
|   using namespace Grid; |  | ||||||
|  |  | ||||||
|   std::cout << " Grid Initialise "<<std::endl; |  | ||||||
|    |  | ||||||
|   Grid_init(&argc, &argv); |  | ||||||
|  |  | ||||||
|   CartesianCommunicator::BarrierWorld(); |  | ||||||
|   std::cout << GridLogMessage << " Clock skew check" <<std::endl; |  | ||||||
|    |  | ||||||
|   int threads = GridThread::GetThreads(); |  | ||||||
|  |  | ||||||
|    // Typedefs to simplify notation |  | ||||||
|   typedef WilsonImplD FermionImplPolicy; |  | ||||||
|   typedef MobiusFermionD FermionAction; |  | ||||||
|   typedef MobiusEOFAFermionD FermionEOFAAction; |  | ||||||
|   typedef typename FermionAction::FermionField FermionField; |  | ||||||
|  |  | ||||||
|   typedef WilsonImplF FermionImplPolicyF; |  | ||||||
|   typedef MobiusFermionF FermionActionF; |  | ||||||
|   typedef MobiusEOFAFermionF FermionEOFAActionF; |  | ||||||
|   typedef typename FermionActionF::FermionField FermionFieldF; |  | ||||||
|  |  | ||||||
|   typedef Grid::XmlReader       Serialiser; |  | ||||||
|  |  | ||||||
|   //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: |  | ||||||
|   IntegratorParameters MD; |  | ||||||
|   //  typedef GenericHMCRunner<LeapFrog> HMCWrapper; |  | ||||||
|   //  MD.name    = std::string("Leap Frog"); |  | ||||||
|   typedef GenericHMCRunner<ForceGradient> HMCWrapper; |  | ||||||
|   MD.name    = std::string("Force Gradient"); |  | ||||||
|   //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper; |  | ||||||
|   //  MD.name    = std::string("MinimumNorm2"); |  | ||||||
|   // TrajL = 2 |  | ||||||
|   // 4/2 => 0.6 dH |  | ||||||
|   // 3/3 => 0.8 dH .. depth 3, slower |  | ||||||
|   //MD.MDsteps =  4; |  | ||||||
|   MD.MDsteps =  8; |  | ||||||
|   MD.trajL   = 0.5; |  | ||||||
|  |  | ||||||
|   HMCparameters HMCparams; |  | ||||||
|   HMCparams.StartTrajectory  = 1077; |  | ||||||
|   HMCparams.Trajectories     = 20; |  | ||||||
|   HMCparams.NoMetropolisUntil=  0; |  | ||||||
|   // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; |  | ||||||
|   HMCparams.StartingType     =std::string("ColdStart"); |  | ||||||
|   //  HMCparams.StartingType     =std::string("CheckpointStart"); |  | ||||||
|   HMCparams.MD = MD; |  | ||||||
|   HMCWrapper TheHMC(HMCparams); |  | ||||||
|  |  | ||||||
|   // Grid from the command line arguments --grid and --mpi |  | ||||||
|   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition |  | ||||||
|  |  | ||||||
|   CheckpointerParameters CPparams; |  | ||||||
|   CPparams.config_prefix = "ckpoint_HMC_lat"; |  | ||||||
|   CPparams.rng_prefix    = "ckpoint_HMC_rng"; |  | ||||||
|   CPparams.saveInterval  = 1; |  | ||||||
|   CPparams.format        = "IEEE64BIG"; |  | ||||||
|   TheHMC.Resources.LoadNerscCheckpointer(CPparams); |  | ||||||
|   std::cout << "loaded NERSC checpointer"<<std::endl; |  | ||||||
|   RNGModuleParameters RNGpar; |  | ||||||
|   RNGpar.serial_seeds = "1 2 3 4 5 6 7 8 9 10"; |  | ||||||
|   RNGpar.parallel_seeds = "6 7 8 9 10"; |  | ||||||
|   TheHMC.Resources.SetRNGSeeds(RNGpar); |  | ||||||
|  |  | ||||||
|   // Construct observables |  | ||||||
|   // here there is too much indirection |  | ||||||
|   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; |  | ||||||
|   TheHMC.Resources.AddObservable<PlaqObs>(); |  | ||||||
|   ////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   const int Ls      = 12; |  | ||||||
|   RealD M5  = 1.8; |  | ||||||
|   RealD b   = 1.5; |  | ||||||
|   RealD c   = 0.5; |  | ||||||
|   RealD beta         = 2.13; |  | ||||||
|   //  Real light_mass   = 5.4e-4; |  | ||||||
|   Real light_mass     = 7.8e-4; |  | ||||||
|   //  Real light_mass     = 7.8e-3; |  | ||||||
|   Real strange_mass = 0.0362; |  | ||||||
|   Real pv_mass      = 1.0; |  | ||||||
|   std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated |  | ||||||
|   //std::vector<Real> hasenbusch({ 0.0145, 0.045, 0.108, 0.25, 0.35 , 0.51, 0.6, 0.8 }); // Updated |  | ||||||
|  |  | ||||||
|   auto GridPtr   = TheHMC.Resources.GetCartesian(); |  | ||||||
|   auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); |  | ||||||
|  |  | ||||||
|   typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD; |  | ||||||
|   typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD; |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   // Domain decomposed |  | ||||||
|   //////////////////////////////////////////////////////////////// |  | ||||||
|   Coordinate latt4  = GridPtr->GlobalDimensions(); |  | ||||||
|   Coordinate mpi    = GridPtr->ProcessorGrid(); |  | ||||||
|   Coordinate shm; |  | ||||||
|  |  | ||||||
|   GlobalSharedMemory::GetShmDims(mpi,shm); |  | ||||||
|  |  | ||||||
|   ////////////////////////// |  | ||||||
|   // Fermion Grids |  | ||||||
|   ////////////////////////// |  | ||||||
|   auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); |  | ||||||
|   auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); |  | ||||||
|  |  | ||||||
|   IwasakiGaugeActionR GaugeAction(beta); |  | ||||||
|  |  | ||||||
|   // temporarily need a gauge field |  | ||||||
|   LatticeGaugeFieldD  U(GridPtr); U=Zero(); |  | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " Running the HMC "<< std::endl; |  | ||||||
|   TheHMC.ReadCommandLine(argc,argv);  // params on CML or from param file |  | ||||||
|   TheHMC.initializeGaugeFieldAndRNGs(U); |  | ||||||
|   std::cout << "loaded NERSC gauge field"<<std::endl; |  | ||||||
|  |  | ||||||
|   // These lines are unnecessary if the boundary conditions (BC) are all periodic |  | ||||||
|   std::vector<Complex> boundary = {1,1,1,-1}; |  | ||||||
|   FermionAction::ImplParams Params(boundary); |  | ||||||
|  |  | ||||||
|   //  double StoppingCondition = 1e-14; |  | ||||||
|   //  double MDStoppingCondition = 1e-9; |  | ||||||
|   double StoppingCondition = 1e-14; |  | ||||||
|   double MDStoppingCondition = 1e-9; |  | ||||||
|   double MDStoppingConditionLoose = 1e-9; |  | ||||||
|   double MDStoppingConditionStrange = 1e-9; |  | ||||||
|   double MaxCGIterations = 50000; |  | ||||||
|   ConjugateGradient<FermionField>  CG(StoppingCondition,MaxCGIterations); |  | ||||||
|   ConjugateGradient<FermionField>  MDCG(MDStoppingCondition,MaxCGIterations); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // Collect actions |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level1(1); |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level2(2); |  | ||||||
|   ActionLevel<HMCWrapper::Field> Level3(4); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // Strange action |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   FermionAction StrangeOp (U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,strange_mass,M5,b,c, Params); |  | ||||||
|   FermionAction StrangePauliVillarsOp(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,pv_mass,  M5,b,c, Params); |  | ||||||
|  |  | ||||||
|   // Probably dominates the force - back to EOFA. |  | ||||||
|   OneFlavourRationalParams SFRp; |  | ||||||
|   SFRp.lo       = 0.8; |  | ||||||
|   SFRp.hi       = 30.0; |  | ||||||
|   SFRp.MaxIter  = 10000; |  | ||||||
|   SFRp.tolerance= 1.0e-12; |  | ||||||
|   SFRp.mdtolerance= 1.0e-9; |  | ||||||
|   SFRp.degree   = 10; |  | ||||||
|   SFRp.precision= 50; |  | ||||||
|    |  | ||||||
|   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c); |  | ||||||
|   MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass,      pv_mass, -1.0, 1, M5, b, c); |  | ||||||
|   ConjugateGradient<FermionField>      ActionCG(StoppingCondition,MaxCGIterations); |  | ||||||
|   ConjugateGradient<FermionField>  DerivativeCG(MDStoppingCondition,MaxCGIterations); |  | ||||||
|   LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); |  | ||||||
|   LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); |  | ||||||
|  |  | ||||||
|   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  |  | ||||||
|     EOFA(Strange_Op_L, Strange_Op_R,  |  | ||||||
| 	 ActionCG,  |  | ||||||
| 	 ActionCG, ActionCG, |  | ||||||
| 	 DerivativeCG, DerivativeCG, |  | ||||||
| 	 SFRp, true); |  | ||||||
|   Level2.push_back(&EOFA); |  | ||||||
|  |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   // up down action |  | ||||||
|   //////////////////////////////////// |  | ||||||
|   std::vector<Real> light_den; |  | ||||||
|   std::vector<Real> light_num; |  | ||||||
|  |  | ||||||
|   int n_hasenbusch = hasenbusch.size(); |  | ||||||
|   light_den.push_back(light_mass);  |  | ||||||
|   for(int h=0;h<n_hasenbusch;h++){ |  | ||||||
|     light_den.push_back(hasenbusch[h]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   for(int h=0;h<n_hasenbusch;h++){ |  | ||||||
|     light_num.push_back(hasenbusch[h]); |  | ||||||
|   } |  | ||||||
|   light_num.push_back(pv_mass); |  | ||||||
|  |  | ||||||
|   std::vector<FermionAction *> Numerators; |  | ||||||
|   std::vector<FermionAction *> Denominators; |  | ||||||
|   std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients; |  | ||||||
|    |  | ||||||
|   std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys; |  | ||||||
|  |  | ||||||
|   typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD; |  | ||||||
|   std::vector<LinearOperatorD *> LinOpD; |  | ||||||
|    |  | ||||||
|   for(int h=0;h<n_hasenbusch+1;h++){ |  | ||||||
|     std::cout << GridLogMessage |  | ||||||
| 	      << " 2f quotient Action "; |  | ||||||
|     std::cout << "det D("<<light_den[h]<<")"; |  | ||||||
|     std::cout << "/ det D("<<light_num[h]<<")"; |  | ||||||
|     std::cout << std::endl; |  | ||||||
|  |  | ||||||
|     FermionAction::ImplParams ParamsNum(boundary); |  | ||||||
|     FermionAction::ImplParams ParamsDen(boundary); |  | ||||||
|      |  | ||||||
|     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); |  | ||||||
|     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); |  | ||||||
|  |  | ||||||
|     LinOpD.push_back(new LinearOperatorD(*Denominators[h])); |  | ||||||
|  |  | ||||||
|     double conv  = MDStoppingCondition; |  | ||||||
|     if (h<3) conv= MDStoppingConditionLoose; // Relax on first two hasenbusch factors |  | ||||||
|      |  | ||||||
|     Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],MDCG,CG,CG)); |  | ||||||
|   } |  | ||||||
|   int nquo=Quotients.size(); |  | ||||||
|   for(int h=0;h<nquo;h++){ |  | ||||||
|     Level1.push_back(Quotients[h]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   // Gauge action |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|   Level3.push_back(&GaugeAction); |  | ||||||
|   TheHMC.TheAction.push_back(Level1); |  | ||||||
|   TheHMC.TheAction.push_back(Level2); |  | ||||||
|   TheHMC.TheAction.push_back(Level3); |  | ||||||
|   std::cout << GridLogMessage << " Action complete "<< std::endl; |  | ||||||
|   ///////////////////////////////////////////////////////////// |  | ||||||
|  |  | ||||||
|   TheHMC.Run();  // no smearing |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |  | ||||||
| } // main |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
							
								
								
									
										637
									
								
								HMC/Mobius2p1p1fEOFA_4Gev.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										637
									
								
								HMC/Mobius2p1p1fEOFA_4Gev.cc
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,637 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
|  | Source file: ./HMC/Mobius2p1p1fEOFA_4Gev.cc | ||||||
|  |  | ||||||
|  | Copyright (C) 2015-2016 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  | Author: Guido Cossu | ||||||
|  | Author: David Murphy | ||||||
|  | Author: Chulwoo Jung <chulwoo@bnl.gov> | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution | ||||||
|  | directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
|  | #ifdef GRID_DEFAULT_PRECISION_DOUBLE | ||||||
|  | #define MIXED_PRECISION | ||||||
|  | #endif | ||||||
|  | // second level EOFA | ||||||
|  | #undef EOFA_H | ||||||
|  | #undef USE_OBC | ||||||
|  | #define DO_IMPLICIT | ||||||
|  |  | ||||||
|  | NAMESPACE_BEGIN(Grid); | ||||||
|  |  | ||||||
|  |   /* | ||||||
|  |    * Need a plan for gauge field update for mixed precision in HMC                      (2x speed up) | ||||||
|  |    *    -- Store the single prec action operator. | ||||||
|  |    *    -- Clone the gauge field from the operator function argument. | ||||||
|  |    *    -- Build the mixed precision operator dynamically from the passed operator and single prec clone. | ||||||
|  |    */ | ||||||
|  |  | ||||||
|  |   // OperatorFunction that solves with a mixed precision conjugate gradient. | ||||||
|  |   // | ||||||
|  |   // The object stores references to a single precision fermion operator (FermOpF) and its | ||||||
|  |   // Schur operator (LinOpF), plus the double precision pair (FermOpD, LinOpD).  On every call | ||||||
|  |   // the doubled gauge links held by FermOpD are converted to single precision and written into | ||||||
|  |   // FermOpF before the solver is built, so FermOpF is modified as a side effect of operator(). | ||||||
|  |   // | ||||||
|  |   // Only references are stored: all four operators must outlive this object. | ||||||
|  |   template<class FermionOperatorD, class FermionOperatorF, class SchurOperatorD, class  SchurOperatorF>  | ||||||
|  |   class MixedPrecisionConjugateGradientOperatorFunction : public OperatorFunction<typename FermionOperatorD::FermionField> { | ||||||
|  |   public: | ||||||
|  |     typedef typename FermionOperatorD::FermionField FieldD; | ||||||
|  |     typedef typename FermionOperatorF::FermionField FieldF; | ||||||
|  |  | ||||||
|  |     using OperatorFunction<FieldD>::operator(); | ||||||
|  |  | ||||||
|  |     RealD   Tolerance; | ||||||
|  |     RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed | ||||||
|  |     Integer MaxInnerIterations; | ||||||
|  |     Integer MaxOuterIterations; | ||||||
|  |     GridBase* SinglePrecGrid4; //Grid for single-precision gauge link fields | ||||||
|  |     GridBase* SinglePrecGrid5; //Grid for single-precision fields handed to the mixed precision solver | ||||||
|  |     RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance | ||||||
|  |  | ||||||
|  |     FermionOperatorF &FermOpF; | ||||||
|  |     FermionOperatorD &FermOpD; | ||||||
|  |     SchurOperatorF &LinOpF; | ||||||
|  |     SchurOperatorD &LinOpD; | ||||||
|  |  | ||||||
|  |     // Iteration counts copied from the solver after the most recent operator() call (zero before the first call) | ||||||
|  |     Integer TotalInnerIterations; //Number of inner CG iterations | ||||||
|  |     Integer TotalOuterIterations; //Number of restarts | ||||||
|  |     Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step | ||||||
|  |  | ||||||
|  |     // tol        : stopping tolerance; also the initial value of InnerTolerance | ||||||
|  |     // maxinnerit : stored as MaxInnerIterations | ||||||
|  |     // maxouterit : stored as MaxOuterIterations | ||||||
|  |     // _sp_grid4  : single precision grid used for the temporary gauge link field | ||||||
|  |     // _sp_grid5  : single precision grid passed to the mixed precision solver | ||||||
|  |     // _FermOpF/_FermOpD, _LinOpF/_LinOpD : single/double precision operators, stored by reference | ||||||
|  |     // | ||||||
|  |     // The initialiser list follows the member declaration order, which is the order in which | ||||||
|  |     // C++ actually initialises members. | ||||||
|  |     MixedPrecisionConjugateGradientOperatorFunction(RealD tol,  | ||||||
|  |                                                     Integer maxinnerit,  | ||||||
|  |                                                     Integer maxouterit,  | ||||||
|  |                                                     GridBase* _sp_grid4,  | ||||||
|  |                                                     GridBase* _sp_grid5,  | ||||||
|  |                                                     FermionOperatorF &_FermOpF, | ||||||
|  |                                                     FermionOperatorD &_FermOpD, | ||||||
|  |                                                     SchurOperatorF   &_LinOpF, | ||||||
|  |                                                     SchurOperatorD   &_LinOpD):  | ||||||
|  |       Tolerance(tol),  | ||||||
|  |       InnerTolerance(tol),  | ||||||
|  |       MaxInnerIterations(maxinnerit),  | ||||||
|  |       MaxOuterIterations(maxouterit),  | ||||||
|  |       SinglePrecGrid4(_sp_grid4), | ||||||
|  |       SinglePrecGrid5(_sp_grid5), | ||||||
|  |       OuterLoopNormMult(100.), | ||||||
|  |       FermOpF(_FermOpF), | ||||||
|  |       FermOpD(_FermOpD), | ||||||
|  |       LinOpF(_LinOpF), | ||||||
|  |       LinOpD(_LinOpD), | ||||||
|  |       TotalInnerIterations(0), | ||||||
|  |       TotalOuterIterations(0), | ||||||
|  |       TotalFinalStepIterations(0) | ||||||
|  |     {  | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Solve for psi given src. | ||||||
|  |     // | ||||||
|  |     // LinOpU must be a SchurOperatorD wrapping the same fermion matrix object as the stored LinOpD; | ||||||
|  |     // this is asserted because the gauge field is read from the stored FermOpD, not from LinOpU. | ||||||
|  |     void operator()(LinearOperatorBase<FieldD> &LinOpU, const FieldD &src, FieldD &psi) { | ||||||
|  |  | ||||||
|  |       std::cout << GridLogMessage << " Mixed precision CG wrapper operator() "<<std::endl; | ||||||
|  |  | ||||||
|  |       SchurOperatorD * SchurOpU = static_cast<SchurOperatorD *>(&LinOpU); | ||||||
|  |  | ||||||
|  |       // Assumption made in code to extract gauge field | ||||||
|  |       // We could avoid storing the LinOpD reference altogether ? | ||||||
|  |       assert(&(SchurOpU->_Mat)==&(LinOpD._Mat)); | ||||||
|  |  | ||||||
|  |       //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |       // Must snarf a single precision copy of the gauge field held by the double operator | ||||||
|  |       //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |       typedef typename FermionOperatorF::GaugeLinkField GaugeLinkFieldF; | ||||||
|  |       typedef typename FermionOperatorD::GaugeLinkField GaugeLinkFieldD; | ||||||
|  |  | ||||||
|  |       GridBase * GridPtrF = SinglePrecGrid4; | ||||||
|  |       GridBase * GridPtrD = FermOpD.Umu.Grid(); | ||||||
|  |       GaugeLinkFieldF Umu_f(GridPtrF); | ||||||
|  |       GaugeLinkFieldD Umu_d(GridPtrD); | ||||||
|  |  | ||||||
|  |       //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |       // Moving this to a Clone method of fermion operator would allow to duplicate the  | ||||||
|  |       // physics parameters and decrease gauge field copies | ||||||
|  |       //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |       // The loop covers 2*Nd Lorentz indices of FermOpD.Umu, one link field at a time. | ||||||
|  |       for(int mu=0;mu<Nd*2;mu++){  | ||||||
|  |         Umu_d = PeekIndex<LorentzIndex>(FermOpD.Umu, mu); | ||||||
|  |         precisionChange(Umu_f,Umu_d); | ||||||
|  |         PokeIndex<LorentzIndex>(FermOpF.Umu, Umu_f, mu); | ||||||
|  |       } | ||||||
|  |       // Refresh the checkerboarded copies held by the single precision operator | ||||||
|  |       pickCheckerboard(Even,FermOpF.UmuEven,FermOpF.Umu); | ||||||
|  |       pickCheckerboard(Odd ,FermOpF.UmuOdd ,FermOpF.Umu); | ||||||
|  |  | ||||||
|  |       //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |       // Make a mixed precision conjugate gradient | ||||||
|  |       //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |       MixedPrecisionConjugateGradient<FieldD,FieldF> MPCG(Tolerance,MaxInnerIterations,MaxOuterIterations,SinglePrecGrid5,LinOpF,LinOpD); | ||||||
|  |       // Forward the tunable members so that changing them on this wrapper actually takes effect | ||||||
|  |       MPCG.InnerTolerance    = InnerTolerance; | ||||||
|  |       MPCG.OuterLoopNormMult = OuterLoopNormMult; | ||||||
|  |       std::cout << GridLogMessage << "Calling mixed precision Conjugate Gradient" <<std::endl; | ||||||
|  |       MPCG(src,psi); | ||||||
|  |  | ||||||
|  |       // Record the iteration counts of this solve | ||||||
|  |       TotalInnerIterations     = MPCG.TotalInnerIterations; | ||||||
|  |       TotalOuterIterations     = MPCG.TotalOuterIterations; | ||||||
|  |       TotalFinalStepIterations = MPCG.TotalFinalStepIterations; | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  | NAMESPACE_END(Grid); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | // HMC driver: reads HMC parameters, builds a Wilson gauge action plus Mobius EOFA and | ||||||
|  | // two-flavour ratio pseudofermion actions, and runs the HMC with the integrator and | ||||||
|  | // metric selected by the DO_IMPLICIT macro (defined near the top of this file). | ||||||
|  | // Preprocessor state set at the top of this file: EOFA_H and USE_OBC are #undef'd, | ||||||
|  | // MIXED_PRECISION is defined only when GRID_DEFAULT_PRECISION_DOUBLE is defined. | ||||||
|  | int main(int argc, char **argv) { | ||||||
|  |   using namespace Grid; | ||||||
|  |  | ||||||
|  |   Grid_init(&argc, &argv); | ||||||
|  |   int threads = GridThread::GetThreads(); | ||||||
|  |   // here make a routine to print all the relevant information on the run | ||||||
|  |   std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl; | ||||||
|  |  | ||||||
|  |    // Typedefs to simplify notation | ||||||
|  |   typedef WilsonImplR FermionImplPolicy; | ||||||
|  |   typedef MobiusFermionD FermionAction; | ||||||
|  |   typedef MobiusFermionF FermionActionF; | ||||||
|  |   typedef MobiusEOFAFermionD FermionEOFAAction; | ||||||
|  |   typedef MobiusEOFAFermionF FermionEOFAActionF; | ||||||
|  |   typedef typename FermionAction::FermionField FermionField; | ||||||
|  |   typedef typename FermionActionF::FermionField FermionFieldF; | ||||||
|  |  | ||||||
|  |   typedef Grid::XmlReader       Serialiser; | ||||||
|  |    | ||||||
|  |   //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: | ||||||
|  |  | ||||||
|  |   // HMC parameters come from HMCparameters.xml; the #else branch holds hard-coded values and is disabled | ||||||
|  |   HMCparameters HMCparams; | ||||||
|  | #if 1 | ||||||
|  |   { | ||||||
|  |     XmlReader  HMCrd("HMCparameters.xml"); | ||||||
|  |     read(HMCrd,"HMCparameters",HMCparams); | ||||||
|  |   } | ||||||
|  | #else | ||||||
|  |   { | ||||||
|  | //    HMCparameters HMCparams; | ||||||
|  |   //  "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; | ||||||
|  |   //  HMCparams.StartingType     =std::string("ColdStart"); | ||||||
|  |     HMCparams.StartingType     =std::string("CheckpointStart"); | ||||||
|  |     HMCparams.StartTrajectory  =7; | ||||||
|  |     HMCparams.SW  =4; | ||||||
|  |     HMCparams.Trajectories     =1000; | ||||||
|  |     HMCparams.NoMetropolisUntil=0; | ||||||
|  |     HMCparams.MD.name          =std::string("Force Gradient"); | ||||||
|  |     HMCparams.MD.MDsteps       = 10; | ||||||
|  |     HMCparams.MD.trajL         = 1.0; | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   // Integrator choice. MD.name is overwritten here, after the XML has been read, | ||||||
|  |   // so whatever name the XML file carried is replaced. | ||||||
|  | #ifdef DO_IMPLICIT | ||||||
|  | //    typedef GenericHMCRunner<ImplicitLeapFrog> HMCWrapper;  | ||||||
|  |   typedef GenericHMCRunner<ImplicitMinimumNorm2> HMCWrapper;  | ||||||
|  |   HMCparams.MD.name          =std::string("ImplicitMinimumNorm2"); | ||||||
|  | #else | ||||||
|  | //  typedef GenericHMCRunner<LeapFrog> HMCWrapper;  | ||||||
|  |   typedef GenericHMCRunner<ForceGradient> HMCWrapper;  | ||||||
|  | //  typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;  | ||||||
|  |   HMCparams.MD.name          =std::string("ForceGradient"); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage<< HMCparams <<std::endl; | ||||||
|  |   HMCWrapper TheHMC(HMCparams); | ||||||
|  |   TheHMC.ReadCommandLine(argc, argv); | ||||||
|  |   // Write TheHMC.Parameters back out to HMCparameters.xml.out | ||||||
|  |   {  | ||||||
|  |     XmlWriter HMCwr("HMCparameters.xml.out"); | ||||||
|  |     write(HMCwr,"HMCparameters",TheHMC.Parameters); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // Grid from the command line arguments --grid and --mpi | ||||||
|  |   TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition | ||||||
|  |    | ||||||
|  |   CheckpointerParameters CPparams; | ||||||
|  |   CPparams.config_prefix = "ckpoint_lat"; | ||||||
|  |   CPparams.rng_prefix    = "ckpoint_rng"; | ||||||
|  |   CPparams.saveInterval  = 1; | ||||||
|  |   CPparams.format        = "IEEE64BIG"; | ||||||
|  |   TheHMC.Resources.LoadNerscCheckpointer(CPparams); | ||||||
|  |  | ||||||
|  |   RNGModuleParameters RNGpar; | ||||||
|  |   RNGpar.serial_seeds = "1 2 3 4 5"; | ||||||
|  |   RNGpar.parallel_seeds = "6 7 8 9 10"; | ||||||
|  |   TheHMC.Resources.SetRNGSeeds(RNGpar); | ||||||
|  |  | ||||||
|  |   // Construct observables | ||||||
|  |   // here there is too much indirection  | ||||||
|  |   typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs; | ||||||
|  |   TheHMC.Resources.AddObservable<PlaqObs>(); | ||||||
|  |   ////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  |   // Physics parameters, hard-coded for this run | ||||||
|  |   const int Ls      = 12; | ||||||
|  |   Real beta         = 5.983; | ||||||
|  |   std::cout << GridLogMessage << " beta  "<< beta << std::endl; | ||||||
|  |   Real light_mass   = 0.00049; | ||||||
|  |   Real strange_mass = 0.0158; | ||||||
|  |   Real charm_mass = 0.191; | ||||||
|  |   Real pv_mass    = 1.0; | ||||||
|  |   RealD M5  = 1.4; | ||||||
|  |   RealD b   = 2.0;  | ||||||
|  |   RealD c   = 1.0; | ||||||
|  |  | ||||||
|  |   // Copied from paper | ||||||
|  | //  std::vector<Real> hasenbusch({ 0.045 }); // Paper values from F1 incorrect run | ||||||
|  |   std::vector<Real> hasenbusch({ 0.0038, 0.0145, 0.045, 0.108 , 0.25, 0.51 }); // Paper values from F1 incorrect run | ||||||
|  |   std::vector<Real> hasenbusch2({ 0.4 }); // Paper values from F1 incorrect run | ||||||
|  |  | ||||||
|  |   // NOTE(review): eofa_mass is commented out but is referenced by the EOFA_H sections below; | ||||||
|  |   // restore it before defining EOFA_H. | ||||||
|  | //  RealD eofa_mass=0.05 ; | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   //Bad choices with large dH. Equalising force L2 norm was not wise. | ||||||
|  |   /////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   //std::vector<Real> hasenbusch({ 0.03, 0.2, 0.3, 0.5, 0.8 });  | ||||||
|  |  | ||||||
|  |   // Double precision grids: 4d grids from the HMC resources, 5d grids built on top with extent Ls | ||||||
|  |   auto GridPtr   = TheHMC.Resources.GetCartesian(); | ||||||
|  |   auto GridRBPtr = TheHMC.Resources.GetRBCartesian(); | ||||||
|  |   auto FGrid     = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr); | ||||||
|  |   auto FrbGrid   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr); | ||||||
|  |  | ||||||
|  |   // Single precision grids, built from the default lattice / MPI layout with the vComplexF SIMD layout | ||||||
|  |   Coordinate latt  = GridDefaultLatt(); | ||||||
|  |   Coordinate mpi   = GridDefaultMpi(); | ||||||
|  |   Coordinate simdF = GridDefaultSimd(Nd,vComplexF::Nsimd()); | ||||||
|  |   Coordinate simdD = GridDefaultSimd(Nd,vComplexD::Nsimd()); | ||||||
|  | //  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi); | ||||||
|  |   auto UGrid_f    = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi); | ||||||
|  |   auto GridRBPtrF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f); | ||||||
|  |   auto FGridF     = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f); | ||||||
|  |   auto FrbGridF   = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #ifndef USE_OBC | ||||||
|  | //  IwasakiGaugeActionR GaugeAction(beta); | ||||||
|  |   WilsonGaugeActionR GaugeAction(beta); | ||||||
|  | #else | ||||||
|  |   std::vector<Complex> boundaryG = {1,1,1,0}; | ||||||
|  |   WilsonGaugeActionR::ImplParams ParamsG(boundaryG); | ||||||
|  |   WilsonGaugeActionR GaugeAction(beta,ParamsG); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   // temporarily need a gauge field | ||||||
|  |   LatticeGaugeField U(GridPtr); | ||||||
|  |   LatticeGaugeFieldF UF(UGrid_f); | ||||||
|  |  | ||||||
|  |   // These lines are unnecessary if BC are all periodic | ||||||
|  | #ifndef USE_OBC | ||||||
|  |   std::vector<Complex> boundary = {1,1,1,-1}; | ||||||
|  | #else | ||||||
|  |   std::vector<Complex> boundary = {1,1,1,0}; | ||||||
|  | #endif | ||||||
|  |   FermionAction::ImplParams Params(boundary); | ||||||
|  |   FermionActionF::ImplParams ParamsF(boundary); | ||||||
|  |    | ||||||
|  |   double ActionStoppingCondition     = 1e-8; | ||||||
|  |   double DerivativeStoppingCondition = 1e-8; | ||||||
|  |   // NOTE(review): declared double but passed below as an iteration count (an Integer parameter | ||||||
|  |   // in the MxPCG constructor), so it is implicitly converted at each use. | ||||||
|  |   double MaxCGIterations =  100000; | ||||||
|  |  | ||||||
|  |   //////////////////////////////////// | ||||||
|  |   // Collect actions | ||||||
|  |   //////////////////////////////////// | ||||||
|  |   // Level1 receives the fermion actions, Level2 the gauge action (see the push_back calls below). | ||||||
|  |   // presumably the constructor argument is the per-level step multiplier — TODO confirm | ||||||
|  |   ActionLevel<HMCWrapper::Field> Level1(1); | ||||||
|  |   ActionLevel<HMCWrapper::Field> Level2(HMCparams.SW); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////// | ||||||
|  |   // Strange action | ||||||
|  |   //////////////////////////////////// | ||||||
|  |   typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF; | ||||||
|  |   typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD; | ||||||
|  |   typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF; | ||||||
|  |   typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD; | ||||||
|  |  | ||||||
|  |   typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG; | ||||||
|  |   typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA; | ||||||
|  |  | ||||||
|  |   // DJM: setup for EOFA ratio (Mobius) | ||||||
|  |   OneFlavourRationalParams OFRp; | ||||||
|  |   OFRp.lo       = 0.99; // How do I know this on F1? | ||||||
|  |   OFRp.hi       = 20; | ||||||
|  |   OFRp.MaxIter  = 100000; | ||||||
|  |   OFRp.tolerance= 1.0e-12; | ||||||
|  |   OFRp.degree   = 12; | ||||||
|  |   OFRp.precision= 50; | ||||||
|  |  | ||||||
|  |    | ||||||
|  |   // Left/right EOFA operators in double (U) and single (UF) precision with matching parameters | ||||||
|  |   MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, charm_mass, 0.0, -1, M5, b, c); | ||||||
|  |   MobiusEOFAFermionF Strange_Op_LF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, strange_mass, strange_mass, charm_mass, 0.0, -1, M5, b, c); | ||||||
|  |   MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , charm_mass, strange_mass,      charm_mass, -1.0, 1, M5, b, c); | ||||||
|  |   MobiusEOFAFermionF Strange_Op_RF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, charm_mass, strange_mass,      charm_mass, -1.0, 1, M5, b, c); | ||||||
|  |    | ||||||
|  | #ifdef EOFA_H | ||||||
|  |   MobiusEOFAFermionD Strange2_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , eofa_mass, eofa_mass, charm_mass , 0.0, -1, M5, b, c); | ||||||
|  |   MobiusEOFAFermionF Strange2_Op_LF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, eofa_mass, eofa_mass, charm_mass , 0.0, -1, M5, b, c); | ||||||
|  |   MobiusEOFAFermionD Strange2_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , charm_mass , eofa_mass,      charm_mass , -1.0, 1, M5, b, c); | ||||||
|  |   MobiusEOFAFermionF Strange2_Op_RF(UF, *FGridF, *FrbGridF, *UGrid_f, *GridRBPtrF, charm_mass , eofa_mass,      charm_mass , -1.0, 1, M5, b, c); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   ConjugateGradient<FermionField>      ActionCG(ActionStoppingCondition,MaxCGIterations); | ||||||
|  |   ConjugateGradient<FermionField>  DerivativeCG(DerivativeStoppingCondition,MaxCGIterations); | ||||||
|  | #ifdef MIXED_PRECISION | ||||||
|  |   // Inner iteration limit passed to every mixed precision solver below | ||||||
|  |   const int MX_inner = 50000; | ||||||
|  |  | ||||||
|  |   // Mixed precision EOFA | ||||||
|  |   LinearOperatorEOFAD Strange_LinOp_L (Strange_Op_L); | ||||||
|  |   LinearOperatorEOFAD Strange_LinOp_R (Strange_Op_R); | ||||||
|  |   LinearOperatorEOFAF Strange_LinOp_LF(Strange_Op_LF); | ||||||
|  |   LinearOperatorEOFAF Strange_LinOp_RF(Strange_Op_RF); | ||||||
|  |  | ||||||
|  | #ifdef EOFA_H | ||||||
|  |   // Mixed precision EOFA | ||||||
|  |   LinearOperatorEOFAD Strange2_LinOp_L (Strange2_Op_L); | ||||||
|  |   LinearOperatorEOFAD Strange2_LinOp_R (Strange2_Op_R); | ||||||
|  |   LinearOperatorEOFAF Strange2_LinOp_LF(Strange2_Op_LF); | ||||||
|  |   LinearOperatorEOFAF Strange2_LinOp_RF(Strange2_Op_RF); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   // One solver object per (left/right) x (action/derivative) combination | ||||||
|  |   MxPCG_EOFA ActionCGL(ActionStoppingCondition, | ||||||
|  | 		       MX_inner, | ||||||
|  | 		       MaxCGIterations, | ||||||
|  | 		       UGrid_f, | ||||||
|  | 		       FrbGridF, | ||||||
|  | 		       Strange_Op_LF,Strange_Op_L, | ||||||
|  | 		       Strange_LinOp_LF,Strange_LinOp_L); | ||||||
|  |  | ||||||
|  | #ifdef EOFA_H | ||||||
|  |   MxPCG_EOFA ActionCGL2(ActionStoppingCondition, | ||||||
|  | 		       MX_inner, | ||||||
|  | 		       MaxCGIterations, | ||||||
|  | 		       UGrid_f, | ||||||
|  | 		       FrbGridF, | ||||||
|  | 		       Strange2_Op_LF,Strange2_Op_L, | ||||||
|  | 		       Strange2_LinOp_LF,Strange2_LinOp_L); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   MxPCG_EOFA DerivativeCGL(DerivativeStoppingCondition, | ||||||
|  | 			   MX_inner, | ||||||
|  | 			   MaxCGIterations, | ||||||
|  | 			   UGrid_f, | ||||||
|  | 			   FrbGridF, | ||||||
|  | 			   Strange_Op_LF,Strange_Op_L, | ||||||
|  | 			   Strange_LinOp_LF,Strange_LinOp_L); | ||||||
|  |  | ||||||
|  | #ifdef EOFA_H | ||||||
|  |   MxPCG_EOFA DerivativeCGL2(DerivativeStoppingCondition, | ||||||
|  | 			   MX_inner, | ||||||
|  | 			   MaxCGIterations, | ||||||
|  | 			   UGrid_f, | ||||||
|  | 			   FrbGridF, | ||||||
|  | 			   Strange2_Op_LF,Strange2_Op_L, | ||||||
|  | 			   Strange2_LinOp_LF,Strange2_LinOp_L); | ||||||
|  | #endif | ||||||
|  |    | ||||||
|  |   MxPCG_EOFA ActionCGR(ActionStoppingCondition, | ||||||
|  | 		       MX_inner, | ||||||
|  | 		       MaxCGIterations, | ||||||
|  | 		       UGrid_f, | ||||||
|  | 		       FrbGridF, | ||||||
|  | 		       Strange_Op_RF,Strange_Op_R, | ||||||
|  | 		       Strange_LinOp_RF,Strange_LinOp_R); | ||||||
|  |    | ||||||
|  | #ifdef EOFA_H | ||||||
|  |   MxPCG_EOFA ActionCGR2(ActionStoppingCondition, | ||||||
|  | 		       MX_inner, | ||||||
|  | 		       MaxCGIterations, | ||||||
|  | 		       UGrid_f, | ||||||
|  | 		       FrbGridF, | ||||||
|  | 		       Strange2_Op_RF,Strange2_Op_R, | ||||||
|  | 		       Strange2_LinOp_RF,Strange2_LinOp_R); | ||||||
|  | #endif | ||||||
|  |    | ||||||
|  |   MxPCG_EOFA DerivativeCGR(DerivativeStoppingCondition, | ||||||
|  | 			   MX_inner, | ||||||
|  | 			   MaxCGIterations, | ||||||
|  | 			   UGrid_f, | ||||||
|  | 			   FrbGridF, | ||||||
|  | 			   Strange_Op_RF,Strange_Op_R, | ||||||
|  | 			   Strange_LinOp_RF,Strange_LinOp_R); | ||||||
|  |    | ||||||
|  | #ifdef EOFA_H | ||||||
|  |   MxPCG_EOFA DerivativeCGR2(DerivativeStoppingCondition, | ||||||
|  | 			   MX_inner, | ||||||
|  | 			   MaxCGIterations, | ||||||
|  | 			   UGrid_f, | ||||||
|  | 			   FrbGridF, | ||||||
|  | 			   Strange2_Op_RF,Strange2_Op_R, | ||||||
|  | 			   Strange2_LinOp_RF,Strange2_LinOp_R); | ||||||
|  | #endif | ||||||
|  |    | ||||||
|  |   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  | ||||||
|  |     EOFA(Strange_Op_L, Strange_Op_R,  | ||||||
|  | 	 ActionCG,  | ||||||
|  | 	 ActionCGL, ActionCGR, | ||||||
|  | 	 DerivativeCGL, DerivativeCGR, | ||||||
|  | 	 OFRp, true); | ||||||
|  |    | ||||||
|  | #ifdef EOFA_H | ||||||
|  |   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  | ||||||
|  |     EOFA2(Strange2_Op_L, Strange2_Op_R,  | ||||||
|  | 	 ActionCG,  | ||||||
|  | 	 ActionCGL2, ActionCGR2, | ||||||
|  | 	 DerivativeCGL2, DerivativeCGR2, | ||||||
|  | 	 OFRp, true); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   Level1.push_back(&EOFA); | ||||||
|  | #ifdef EOFA_H | ||||||
|  |   Level1.push_back(&EOFA2); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #else | ||||||
|  |   // Without MIXED_PRECISION every solver slot is ActionCG; DerivativeCG is not used for the EOFA action | ||||||
|  |   ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>  | ||||||
|  |     EOFA(Strange_Op_L, Strange_Op_R,  | ||||||
|  | 	 ActionCG,  | ||||||
|  | 	 ActionCG, ActionCG, | ||||||
|  | 	 ActionCG, ActionCG, | ||||||
|  | 	 //         DerivativeCG, DerivativeCG, | ||||||
|  | 	 OFRp, true); | ||||||
|  |   Level1.push_back(&EOFA); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   //////////////////////////////////// | ||||||
|  |   // up down action | ||||||
|  |   //////////////////////////////////// | ||||||
|  |   // Build paired numerator/denominator mass lists: | ||||||
|  |   //   light_den = { light_mass, hasenbusch[...], charm_mass, hasenbusch2[...] } | ||||||
|  |   //   light_num = { hasenbusch[...], pv_mass,    hasenbusch2[...], pv_mass } | ||||||
|  |   // so entry h describes the ratio light_num[h] / light_den[h]. | ||||||
|  |   std::vector<Real> light_den; | ||||||
|  |   std::vector<Real> light_num; | ||||||
|  |  | ||||||
|  |   int n_hasenbusch = hasenbusch.size(); | ||||||
|  |   light_den.push_back(light_mass); | ||||||
|  |   for(int h=0;h<n_hasenbusch;h++){ | ||||||
|  |     light_den.push_back(hasenbusch[h]); | ||||||
|  |     light_num.push_back(hasenbusch[h]); | ||||||
|  |   } | ||||||
|  |   light_num.push_back(pv_mass); | ||||||
|  |  | ||||||
|  |   int n_hasenbusch2 = hasenbusch2.size(); | ||||||
|  |   light_den.push_back(charm_mass); | ||||||
|  |   for(int h=0;h<n_hasenbusch2;h++){ | ||||||
|  |     light_den.push_back(hasenbusch2[h]); | ||||||
|  |     light_num.push_back(hasenbusch2[h]); | ||||||
|  |   } | ||||||
|  |   light_num.push_back(pv_mass); | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////// | ||||||
|  |   // Forced to replicate the MxPCG and DenominatorsF etc.. because | ||||||
|  |   // there is no convenient way to "Clone" physics params from double op | ||||||
|  |   // into single op for any operator pair. | ||||||
|  |   // Same issue prevents using MxPCG in the Heatbath step | ||||||
|  |   ////////////////////////////////////////////////////////////// | ||||||
|  |   // The objects pushed into these vectors are allocated with new and are not deleted in this function. | ||||||
|  |   std::vector<FermionAction *> Numerators; | ||||||
|  |   std::vector<FermionAction *> Denominators; | ||||||
|  |   std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients; | ||||||
|  |   std::vector<MxPCG *> ActionMPCG; | ||||||
|  |   std::vector<MxPCG *> MPCG; | ||||||
|  |   std::vector<FermionActionF *> DenominatorsF; | ||||||
|  |   std::vector<LinearOperatorD *> LinOpD; | ||||||
|  |   std::vector<LinearOperatorF *> LinOpF;  | ||||||
|  |  | ||||||
|  |   for(int h=0;h<light_den.size();h++){ | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << " 2f quotient Action  "<< light_num[h] << " / " << light_den[h]<< std::endl; | ||||||
|  |  | ||||||
|  |     Numerators.push_back  (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params)); | ||||||
|  |     Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params)); | ||||||
|  |  | ||||||
|  | #ifdef MIXED_PRECISION | ||||||
|  |     //////////////////////////////////////////////////////////////////////////// | ||||||
|  |     // Mixed precision CG for 2f force | ||||||
|  |     //////////////////////////////////////////////////////////////////////////// | ||||||
|  |     double DerivativeStoppingConditionLoose = 1e-8; | ||||||
|  |  | ||||||
|  |     DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*UGrid_f,*GridRBPtrF,light_den[h],M5,b,c, ParamsF)); | ||||||
|  |     LinOpD.push_back(new LinearOperatorD(*Denominators[h])); | ||||||
|  |     LinOpF.push_back(new LinearOperatorF(*DenominatorsF[h])); | ||||||
|  |  | ||||||
|  |     double conv  = DerivativeStoppingCondition; | ||||||
|  |     if (h<3) conv= DerivativeStoppingConditionLoose; // Applies to the first three quotients (h=0,1,2); both tolerances are currently 1e-8, so this has no effect | ||||||
|  |     MPCG.push_back(new MxPCG(conv, | ||||||
|  | 			     MX_inner, | ||||||
|  | 			     MaxCGIterations, | ||||||
|  | 			     UGrid_f, | ||||||
|  | 			     FrbGridF, | ||||||
|  | 			     *DenominatorsF[h],*Denominators[h], | ||||||
|  | 			     *LinOpF[h], *LinOpD[h]) ); | ||||||
|  |  | ||||||
|  |     ActionMPCG.push_back(new MxPCG(ActionStoppingCondition, | ||||||
|  | 				   MX_inner, | ||||||
|  | 				   MaxCGIterations, | ||||||
|  | 				   UGrid_f, | ||||||
|  | 				   FrbGridF, | ||||||
|  | 				   *DenominatorsF[h],*Denominators[h], | ||||||
|  | 				   *LinOpF[h], *LinOpD[h]) ); | ||||||
|  |  | ||||||
|  |     // Heatbath not mixed yet. As inverts numerators not so important as raised mass. | ||||||
|  |     Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],ActionCG)); | ||||||
|  | #else | ||||||
|  |     //////////////////////////////////////////////////////////////////////////// | ||||||
|  |     // Standard CG for 2f force | ||||||
|  |     //////////////////////////////////////////////////////////////////////////// | ||||||
|  |     Quotients.push_back   (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],DerivativeCG,ActionCG)); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   // NOTE(review): only the first n_hasenbusch+1 quotients are added to the action; the | ||||||
|  |   // n_hasenbusch2+1 quotients built from charm_mass / hasenbusch2 above are constructed | ||||||
|  |   // but never pushed to a level — confirm this is intended. | ||||||
|  |   for(int h=0;h<n_hasenbusch+1;h++){ | ||||||
|  |     Level1.push_back(Quotients[h]); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////////// | ||||||
|  |   // Gauge action | ||||||
|  |   ///////////////////////////////////////////////////////////// | ||||||
|  |   Level2.push_back(&GaugeAction); | ||||||
|  |   TheHMC.TheAction.push_back(Level1); | ||||||
|  |   TheHMC.TheAction.push_back(Level2); | ||||||
|  |   std::cout << GridLogMessage << " Action complete "<< std::endl; | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////////// | ||||||
|  |   // HMC parameters are serialisable | ||||||
|  |  | ||||||
|  |   // Smearing policy and metric handed to TheHMC.Run below | ||||||
|  |   NoSmearing<HMCWrapper::ImplPolicy> S; | ||||||
|  | #ifndef DO_IMPLICIT | ||||||
|  |   TrivialMetric<HMCWrapper::ImplPolicy::Field> Mtr; | ||||||
|  | #else | ||||||
|  |     // Hard-coded two-term coefficient sets (gpar, mpar) for the LaplacianAdjointRat metric | ||||||
|  |     LaplacianRatParams gpar(2),mpar(2); | ||||||
|  |     gpar.offset = 1.; | ||||||
|  |     gpar.a0[0] = 500.; | ||||||
|  |     gpar.a1[0] = 0.; | ||||||
|  |     gpar.b0[0] = 0.25; | ||||||
|  |     gpar.b1[0] = 1.; | ||||||
|  |     gpar.a0[1] = -500.; | ||||||
|  |     gpar.a1[1] = 0.; | ||||||
|  |     gpar.b0[1] = 0.36; | ||||||
|  |     gpar.b1[1] = 1.2; | ||||||
|  |     gpar.b2=1.; | ||||||
|  |  | ||||||
|  |     mpar.offset = 1.; | ||||||
|  |     mpar.a0[0] =  -0.850891906532; | ||||||
|  |     mpar.a1[0] = -1.54707654538; | ||||||
|  |     mpar. b0[0] = 2.85557166137; | ||||||
|  |     mpar. b1[0] = 5.74194794773; | ||||||
|  |     mpar.a0[1] = -13.5120056831218384729709214298; | ||||||
|  |     mpar.a1[1] = 1.54707654538396877086370295729; | ||||||
|  |     mpar.b0[1] = 19.2921090880640520026645390317; | ||||||
|  |     mpar.b1[1] = -3.54194794773029020262811172870; | ||||||
|  |     mpar.b2=1.; | ||||||
|  |     // Rescale: the a1/b1 coefficients by 16 and b2 by 16*16 | ||||||
|  |     for(int i=0;i<2;i++){ | ||||||
|  |        gpar.a1[i] *=16.; | ||||||
|  |        gpar.b1[i] *=16.; | ||||||
|  |        mpar.a1[i] *=16.; | ||||||
|  |        mpar.b1[i] *=16.; | ||||||
|  |     } | ||||||
|  |     gpar.b2 *= 16.*16.; | ||||||
|  |     mpar.b2 *= 16.*16.; | ||||||
|  |  | ||||||
|  |     ConjugateGradient<LatticeGaugeField> CG(1.0e-8,10000); | ||||||
|  |     // NOTE(review): LapPar is constructed but not referenced again in this function | ||||||
|  |     LaplacianParams LapPar(0.0001, 1.0, 10000, 1e-8, 12, 64); | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << "LaplacianRat " << std::endl; | ||||||
|  |     // Solver tolerance for both coefficient sets is taken from the HMC parameters | ||||||
|  |     gpar.tolerance=HMCparams.MD.RMHMCCGTol; | ||||||
|  |     mpar.tolerance=HMCparams.MD.RMHMCCGTol; | ||||||
|  |     std::cout << GridLogMessage << "gpar offset= " << gpar.offset <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " a0= " << gpar.a0 <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " a1= " << gpar.a1 <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " b0= " << gpar.b0 <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " b1= " << gpar.b1 <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " b2= " << gpar.b2 <<std::endl ;; | ||||||
|  |  | ||||||
|  |     std::cout << GridLogMessage << "mpar offset= " << mpar.offset <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " a0= " << mpar.a0 <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " a1= " << mpar.a1 <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " b0= " << mpar.b0 <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " b1= " << mpar.b1 <<std::endl; | ||||||
|  |     std::cout << GridLogMessage << " b2= " << mpar.b2 <<std::endl; | ||||||
|  | //  Assumes PeriodicGimplR or D at the moment | ||||||
|  |     auto UGrid = TheHMC.Resources.GetCartesian("gauge"); | ||||||
|  | //    auto UGrid_f   = GridPtrF; | ||||||
|  | //  auto GridPtrF   = SpaceTimeGrid::makeFourDimGrid(latt,simdF,mpi); | ||||||
|  | //    std::cout << GridLogMessage << " UGrid= " << UGrid <<std::endl; | ||||||
|  | //    std::cout << GridLogMessage << " UGrid_f= " << UGrid_f <<std::endl; | ||||||
|  |  | ||||||
|  |     // Metric built on the double precision "gauge" grid and the single precision UGrid_f created earlier | ||||||
|  |     LaplacianAdjointRat<HMCWrapper::ImplPolicy, PeriodicGimplF> Mtr(UGrid, UGrid_f ,CG, gpar, mpar); | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << " Running the HMC "<< std::endl; | ||||||
|  |   TheHMC.Run(S,Mtr);  // no smearing | ||||||
|  |  | ||||||
|  |   Grid_finalize(); | ||||||
|  | } // main | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1,22 +0,0 @@ | |||||||
#!/bin/bash |  | ||||||
#PBS -q EarlyAppAccess |  | ||||||
#PBS -l select=2 |  | ||||||
#PBS -l walltime=01:00:00 |  | ||||||
#PBS -A LatticeQCD_aesp_CNDA |  | ||||||
 |  | ||||||
# PBS batch script: launches the halo_mpi benchmark through the gpu_tile_compact.sh wrapper. |  | ||||||
 |  | ||||||
export TZ='/usr/share/zoneinfo/US/Central' |  | ||||||
export OMP_PROC_BIND=spread |  | ||||||
export OMP_NUM_THREADS=3 |  | ||||||
unset OMP_PLACES |  | ||||||
 |  | ||||||
# Run from the directory the job was submitted from |  | ||||||
cd $PBS_O_WORKDIR |  | ||||||
 |  | ||||||
# NOTE: NNODES, NRANKS, NDEPTH, NTHREADS and NTOTRANKS are computed here but are not |  | ||||||
# referenced by CMD below, which hard-codes "-np 2 -ppn 1". |  | ||||||
NNODES=`wc -l < $PBS_NODEFILE` |  | ||||||
NRANKS=12         # Number of MPI ranks per node |  | ||||||
NDEPTH=4          # Number of hardware threads per rank, spacing between MPI ranks on a node |  | ||||||
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS |  | ||||||
 |  | ||||||
NTOTRANKS=$(( NNODES * NRANKS )) |  | ||||||
 |  | ||||||
# Two ranks in total, one per node, with the benchmark's --mpi decomposition set to 2.1.1.1 |  | ||||||
CMD="mpiexec -np 2 -ppn 1  -envall ./gpu_tile_compact.sh ./halo_mpi --mpi 2.1.1.1" |  | ||||||
$CMD |  | ||||||
| @@ -1 +0,0 @@ | |||||||
# Build the halo_mpi benchmark from halo_mpi.cc with the MPI C++ compiler wrapper, passing -fsycl |  | ||||||
mpicxx  -fsycl halo_mpi.cc -o halo_mpi |  | ||||||
| @@ -1,30 +0,0 @@ | |||||||
#!/bin/bash |  | ||||||
 |  | ||||||
# Per-rank launch wrapper: picks a NUMA domain, GPU and tile for this process from the |  | ||||||
# lookup tables below, indexed by $PALS_LOCAL_RANKID, then runs the given command under numactl. |  | ||||||
# Each table has 12 entries, i.e. local rank ids 0..11 are covered. |  | ||||||
export NUMA_PMAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) |  | ||||||
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 ) |  | ||||||
export  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 ) |  | ||||||
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ) |  | ||||||
 |  | ||||||
# Look up this rank's entries |  | ||||||
export PNUMA=${NUMA_PMAP[$PALS_LOCAL_RANKID]} |  | ||||||
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
   |  | ||||||
 |  | ||||||
# Restrict the process to the selected <gpu>.<tile> |  | ||||||
export ZE_AFFINITY_MASK=$gpu_id.$tile_id |  | ||||||
export ONEAPI_DEVICE_FILTER=gpu,level_zero |  | ||||||
 |  | ||||||
# Alternative runtime settings, currently disabled |  | ||||||
#unset EnableWalkerPartition |  | ||||||
#export EnableImplicitScaling=0 |  | ||||||
#export GRID_MPICH_NIC_BIND=$NIC |  | ||||||
#export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id |  | ||||||
#export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 |  | ||||||
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 |  | ||||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 |  | ||||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 |  | ||||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 |  | ||||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 |  | ||||||
 |  | ||||||
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " |  | ||||||
 |  | ||||||
# Run the wrapped command with memory taken from node $PNUMA (-m) and CPUs from node $NUMA (-N) |  | ||||||
numactl -m $PNUMA -N $NUMA  "$@" |  | ||||||
| @@ -1,333 +0,0 @@ | |||||||
#include <cassert>
#include <cctype>
#include <complex>
#include <cstdint>
#include <memory>
#include <vector>
#include <algorithm>
#include <array>
#include <sstream>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <ctime>
#include <sys/time.h>

#include <mpi.h>
|  |  | ||||||
| /************************************************************** |  | ||||||
|  * GPU - GPU memory cartesian halo exchange benchmark |  | ||||||
|  * Config: what is the target |  | ||||||
|  ************************************************************** |  | ||||||
|  */ |  | ||||||
| #undef ACC_CUDA |  | ||||||
| #undef  ACC_HIP |  | ||||||
| #define  ACC_SYCL |  | ||||||
| #undef  ACC_NONE |  | ||||||
|  |  | ||||||
| /************************************************************** |  | ||||||
|  * Some MPI globals |  | ||||||
|  ************************************************************** |  | ||||||
|  */ |  | ||||||
| MPI_Comm WorldComm; |  | ||||||
| MPI_Comm WorldShmComm; |  | ||||||
|  |  | ||||||
| int WorldSize; |  | ||||||
| int WorldRank; |  | ||||||
|  |  | ||||||
| int WorldShmSize; |  | ||||||
| int WorldShmRank; |  | ||||||
|  |  | ||||||
| /************************************************************** |  | ||||||
|  * Allocate buffers on the GPU, SYCL needs an init call and context |  | ||||||
|  ************************************************************** |  | ||||||
|  */ |  | ||||||
#ifdef ACC_CUDA
// CUDA backend. cudaMalloc/cudaFree/cudaGetErrorString belong to the CUDA
// runtime API, so <cuda_runtime.h> is included explicitly rather than relying
// on the compiler driver to pull it in.
#include <cuda.h>
#include <cuda_runtime.h>
/// No explicit device initialisation is performed for CUDA.
void acceleratorInit(void){}
/// Allocate `bytes` of device memory.
/// On failure the CUDA error string is printed and the assert fires; in
/// builds with NDEBUG the function returns NULL instead.
void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  assert(err==cudaSuccess);
  return ptr;
}
/// Release memory obtained from acceleratorAllocDevice().
void acceleratorFreeDevice(void *ptr){  cudaFree(ptr);}
#endif
#ifdef ACC_HIP
// HIP backend.
#include <hip/hip_runtime.h>
/// No explicit device initialisation is performed for HIP.
void acceleratorInit(void){}
/// Allocate `bytes` of device memory.
/// Returns NULL and prints the HIP error string when hipMalloc fails.
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    // %zu is the conversion for size_t; %ld is only correct where long and
    // size_t happen to have the same width.
    printf(" hipMalloc failed for %zu %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
}
/// Release memory obtained from acceleratorAllocDevice().
/// The hipFree status is deliberately discarded.
inline void acceleratorFreeDevice(void *ptr){ (void)hipFree(ptr); }
#endif
#ifdef ACC_SYCL
// SYCL backend: one queue on the selected GPU device is created at start-up
// and used for every USM allocation afterwards.
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
cl::sycl::queue *theAccelerator;
/// Create the global queue `theAccelerator` on a GPU device and print the
/// device name. Must be called before any allocation.
void acceleratorInit(void)
{
  // Two spellings of "select a GPU"; flip the switch to use the
  // gpu_selector_v form instead of the gpu_selector class.
#if 1
  cl::sycl::gpu_selector selector;
  cl::sycl::device selectedDevice { selector };
#else
  cl::sycl::device selectedDevice {cl::sycl::gpu_selector_v  };
#endif
  // The queue lives for the whole program and is never deleted.
  theAccelerator = new sycl::queue (selectedDevice);
  auto name = theAccelerator->get_device().get_info<sycl::info::device::name>();
  printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str()); fflush(stdout);
}
/// Allocate `bytes` of USM device memory on the global queue.
inline void *acceleratorAllocDevice(size_t bytes){ return sycl::malloc_device(bytes,*theAccelerator); }
/// Release USM memory obtained from acceleratorAllocDevice().
/// Qualified so it cannot be mistaken for the C library free().
inline void acceleratorFreeDevice(void *ptr){ sycl::free(ptr,*theAccelerator); }
#endif
#ifdef ACC_NONE
// Host-only fallback: "device" buffers are ordinary heap allocations.

/// Nothing to set up when no accelerator is targeted.
void acceleratorInit(void)
{
}

/// Allocate `bytes` with malloc().
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *hostPtr = malloc(bytes);
  return hostPtr;
}

/// Release a buffer obtained from acceleratorAllocDevice().
inline void acceleratorFreeDevice(void *ptr)
{
  free(ptr);
}
#endif
|  |  | ||||||
|  |  | ||||||
| /************************************************************** |  | ||||||
|  * Microsecond timer |  | ||||||
|  ************************************************************** |  | ||||||
|  */ |  | ||||||
/// Wall-clock time from gettimeofday(), expressed in microseconds.
inline double usecond(void) {
  struct timeval now;
  gettimeofday(&now,NULL);
  const double whole_seconds_us = 1.0e6*now.tv_sec;
  const double fraction_us      = 1.0*now.tv_usec;
  return whole_seconds_us + fraction_us;
}
| /************************************************************** |  | ||||||
|  * Main benchmark routine |  | ||||||
|  ************************************************************** |  | ||||||
|  */ |  | ||||||
| void Benchmark(int64_t L,std::vector<int> cart_geom,bool use_device,int ncall) |  | ||||||
| { |  | ||||||
|   int64_t words = 3*4*2; |  | ||||||
|   int64_t face,vol; |  | ||||||
|   int Nd=cart_geom.size(); |  | ||||||
|    |  | ||||||
|   /************************************************************** |  | ||||||
|    * L^Nd volume, L^(Nd-1) faces, 12 complex per site |  | ||||||
|    * Allocate memory for these |  | ||||||
|    ************************************************************** |  | ||||||
|    */ |  | ||||||
|   face=1; for( int d=0;d<Nd-1;d++) face = face*L; |  | ||||||
|   vol=1;  for( int d=0;d<Nd;d++) vol = vol*L; |  | ||||||
|  |  | ||||||
|    |  | ||||||
|   std::vector<void *> send_bufs; |  | ||||||
|   std::vector<void *> recv_bufs; |  | ||||||
|   size_t vw = face*words; |  | ||||||
|   size_t bytes = face*words*sizeof(double); |  | ||||||
|  |  | ||||||
|   if ( use_device ) { |  | ||||||
|     for(int d=0;d<2*Nd;d++){ |  | ||||||
|       send_bufs.push_back(acceleratorAllocDevice(bytes)); |  | ||||||
|       recv_bufs.push_back(acceleratorAllocDevice(bytes)); |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     for(int d=0;d<2*Nd;d++){ |  | ||||||
|       send_bufs.push_back(malloc(bytes)); |  | ||||||
|       recv_bufs.push_back(malloc(bytes)); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   /********************************************************* |  | ||||||
|    * Build cartesian communicator |  | ||||||
|    ********************************************************* |  | ||||||
|    */ |  | ||||||
|   int ierr; |  | ||||||
|   int rank; |  | ||||||
|   std::vector<int> coor(Nd); |  | ||||||
|   MPI_Comm communicator; |  | ||||||
|   std::vector<int> periodic(Nd,1); |  | ||||||
|   MPI_Cart_create(WorldComm,Nd,&cart_geom[0],&periodic[0],0,&communicator); |  | ||||||
|   MPI_Comm_rank(communicator,&rank); |  | ||||||
|   MPI_Cart_coords(communicator,rank,Nd,&coor[0]); |  | ||||||
|  |  | ||||||
|   static int reported; |  | ||||||
|   if ( ! reported ) {  |  | ||||||
|     printf("World Rank %d Shm Rank %d CartCoor %d %d %d %d\n",WorldRank,WorldShmRank, |  | ||||||
| 	 coor[0],coor[1],coor[2],coor[3]); fflush(stdout); |  | ||||||
|     reported =1 ; |  | ||||||
|   } |  | ||||||
|   /********************************************************* |  | ||||||
|    * Perform halo exchanges |  | ||||||
|    ********************************************************* |  | ||||||
|    */ |  | ||||||
|   for(int d=0;d<Nd;d++){ |  | ||||||
|     if ( cart_geom[d]>1 ) { |  | ||||||
|       double t0=usecond(); |  | ||||||
|  |  | ||||||
|       int from,to; |  | ||||||
|        |  | ||||||
|       MPI_Barrier(communicator); |  | ||||||
|       for(int n=0;n<ncall;n++){ |  | ||||||
| 	 |  | ||||||
| 	void *xmit = (void *)send_bufs[d]; |  | ||||||
| 	void *recv = (void *)recv_bufs[d]; |  | ||||||
| 	 |  | ||||||
| 	ierr=MPI_Cart_shift(communicator,d,1,&from,&to); |  | ||||||
| 	assert(ierr==0); |  | ||||||
| 	 |  | ||||||
| 	ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank, |  | ||||||
| 			  recv,bytes,MPI_CHAR,from, from, |  | ||||||
| 			  communicator,MPI_STATUS_IGNORE); |  | ||||||
| 	assert(ierr==0); |  | ||||||
| 	 |  | ||||||
| 	xmit = (void *)send_bufs[Nd+d]; |  | ||||||
| 	recv = (void *)recv_bufs[Nd+d]; |  | ||||||
| 	 |  | ||||||
| 	ierr=MPI_Cart_shift(communicator,d,-1,&from,&to); |  | ||||||
| 	assert(ierr==0); |  | ||||||
| 	 |  | ||||||
| 	ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,to,rank, |  | ||||||
| 			  recv,bytes,MPI_CHAR,from, from, |  | ||||||
| 			  communicator,MPI_STATUS_IGNORE); |  | ||||||
| 	assert(ierr==0); |  | ||||||
|       } |  | ||||||
|       MPI_Barrier(communicator); |  | ||||||
|  |  | ||||||
|       double t1=usecond(); |  | ||||||
|        |  | ||||||
|       double dbytes    = bytes*WorldShmSize; |  | ||||||
|       double xbytes    = dbytes*2.0*ncall; |  | ||||||
|       double rbytes    = xbytes; |  | ||||||
|       double bidibytes = xbytes+rbytes; |  | ||||||
|  |  | ||||||
|       if ( ! WorldRank ) { |  | ||||||
| 	printf("\t%12ld\t %12ld %16.0lf\n",L,bytes,bidibytes/(t1-t0)); fflush(stdout); |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   /********************************************************* |  | ||||||
|    * Free memory |  | ||||||
|    ********************************************************* |  | ||||||
|    */ |  | ||||||
|   if ( use_device ) { |  | ||||||
|     for(int d=0;d<2*Nd;d++){ |  | ||||||
|       acceleratorFreeDevice(send_bufs[d]); |  | ||||||
|       acceleratorFreeDevice(recv_bufs[d]); |  | ||||||
|     } |  | ||||||
|   } else { |  | ||||||
|     for(int d=0;d<2*Nd;d++){ |  | ||||||
|       free(send_bufs[d]); |  | ||||||
|       free(recv_bufs[d]); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /************************************** |  | ||||||
|  * Command line junk |  | ||||||
|  **************************************/ |  | ||||||
|  |  | ||||||
/// Return the argument that follows the first occurrence of `option` in
/// [begin,end), or an empty string when `option` is absent or is the last
/// argument.
std::string CmdOptionPayload(char ** begin, char ** end, const std::string & option)
{
  for (char ** cursor = begin; cursor != end; ++cursor) {
    if (option == *cursor) {
      char ** value = cursor + 1;
      if (value == end) break;   // option present but nothing follows it
      return std::string(*value);
    }
  }
  return std::string("");
}
/// True when some argument in [begin,end) equals `option` exactly.
bool CmdOptionExists(char** begin, char** end, const std::string& option)
{
  return std::any_of(begin, end,
                     [&option](const char *arg) { return option == arg; });
}
/// Parse integers separated by single punctuation characters (for example
/// "2.1.1.1") into `vec`, replacing its previous contents. Parsing stops at
/// the first position where an integer cannot be read.
void CmdOptionIntVector(const std::string &str,std::vector<int> & vec)
{
  vec.clear();
  std::stringstream input(str);
  for (int value; input >> value; ) {
    vec.push_back(value);
    // Step over exactly one separator, if one follows the number.
    const bool separator_next = std::ispunct(input.peek()) != 0;
    if (separator_next) {
      input.ignore();
    }
  }
}
| /************************************** |  | ||||||
|  * Command line junk |  | ||||||
|  **************************************/ |  | ||||||
| int main(int argc, char **argv) |  | ||||||
| { |  | ||||||
|   std::string arg; |  | ||||||
|  |  | ||||||
|   acceleratorInit(); |  | ||||||
|  |  | ||||||
|   MPI_Init(&argc,&argv); |  | ||||||
|  |  | ||||||
|   WorldComm = MPI_COMM_WORLD; |  | ||||||
|    |  | ||||||
|   MPI_Comm_split_type(WorldComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); |  | ||||||
|  |  | ||||||
|   MPI_Comm_rank(WorldComm     ,&WorldRank); |  | ||||||
|   MPI_Comm_size(WorldComm     ,&WorldSize); |  | ||||||
|  |  | ||||||
|   MPI_Comm_rank(WorldShmComm     ,&WorldShmRank); |  | ||||||
|   MPI_Comm_size(WorldShmComm     ,&WorldShmSize); |  | ||||||
|  |  | ||||||
|   if ( WorldSize/WorldShmSize > 2) { |  | ||||||
|     printf("This benchmark is meant to run on at most two nodes only\n"); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   auto mpi =std::vector<int>({1,1,1,1}); |  | ||||||
|  |  | ||||||
|   if( CmdOptionExists(argv,argv+argc,"--mpi") ){ |  | ||||||
|     arg = CmdOptionPayload(argv,argv+argc,"--mpi"); |  | ||||||
|     CmdOptionIntVector(arg,mpi); |  | ||||||
|   } else { |  | ||||||
|     printf("Must specify --mpi <n1.n2.n3.n4> command line argument\n"); |  | ||||||
|     exit(0); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if( !WorldRank ) { |  | ||||||
|     printf("***********************************\n"); |  | ||||||
|     printf("%d ranks\n",WorldSize);  |  | ||||||
|     printf("%d ranks-per-node\n",WorldShmSize); |  | ||||||
|     printf("%d nodes\n",WorldSize/WorldShmSize);fflush(stdout); |  | ||||||
|     printf("Cartesian layout: "); |  | ||||||
|     for(int d=0;d<mpi.size();d++){ |  | ||||||
|       printf("%d ",mpi[d]); |  | ||||||
|     } |  | ||||||
|     printf("\n");fflush(stdout); |  | ||||||
|     printf("***********************************\n"); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|    |  | ||||||
|   if( !WorldRank ) { |  | ||||||
|     printf("=========================================================\n"); |  | ||||||
|     printf("= Benchmarking HOST memory MPI performance               \n"); |  | ||||||
|     printf("=========================================================\n");fflush(stdout); |  | ||||||
|     printf("= L\t pkt bytes\t MB/s           \n"); |  | ||||||
|     printf("=========================================================\n");fflush(stdout); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   for(int L=16;L<=64;L+=4){ |  | ||||||
|     Benchmark(L,mpi,false,100); |  | ||||||
|   }   |  | ||||||
|  |  | ||||||
|   if( !WorldRank ) { |  | ||||||
|     printf("=========================================================\n"); |  | ||||||
|     printf("= Benchmarking DEVICE memory MPI performance             \n"); |  | ||||||
|     printf("=========================================================\n");fflush(stdout); |  | ||||||
|   } |  | ||||||
|   for(int L=16;L<=64;L+=4){ |  | ||||||
|     Benchmark(L,mpi,true,100); |  | ||||||
|   }   |  | ||||||
|  |  | ||||||
|   if( !WorldRank ) { |  | ||||||
|     printf("=========================================================\n"); |  | ||||||
|     printf("= DONE   \n"); |  | ||||||
|     printf("=========================================================\n"); |  | ||||||
|   } |  | ||||||
|   MPI_Finalize(); |  | ||||||
| } |  | ||||||
| @@ -365,9 +365,15 @@ public: | |||||||
|     GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5); |     GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5); | ||||||
|     std::cout << GridLogMessage << "Initialised RNGs" << std::endl; |     std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||||
|  |  | ||||||
|  | #if 1 | ||||||
|     typedef DomainWallFermionF Action; |     typedef DomainWallFermionF Action; | ||||||
|     typedef typename Action::FermionField Fermion; |     typedef typename Action::FermionField Fermion; | ||||||
|     typedef LatticeGaugeFieldF Gauge; |     typedef LatticeGaugeFieldF Gauge; | ||||||
|  | #else | ||||||
|  |     typedef GparityDomainWallFermionF Action; | ||||||
|  |     typedef typename Action::FermionField Fermion; | ||||||
|  |     typedef LatticeGaugeFieldF Gauge; | ||||||
|  | #endif | ||||||
|      |      | ||||||
|     ///////// Source preparation //////////// |     ///////// Source preparation //////////// | ||||||
|     Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu);  |     Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu);  | ||||||
| @@ -635,6 +641,170 @@ public: | |||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|     return mflops_best; |     return mflops_best; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   static double Laplace(int L) | ||||||
|  |   { | ||||||
|  |     double mflops; | ||||||
|  |     double mflops_best = 0; | ||||||
|  |     double mflops_worst= 0; | ||||||
|  |     std::vector<double> mflops_all; | ||||||
|  |  | ||||||
|  |     /////////////////////////////////////////////////////// | ||||||
|  |     // Set/Get the layout & grid size | ||||||
|  |     /////////////////////////////////////////////////////// | ||||||
|  |     int threads = GridThread::GetThreads(); | ||||||
|  |     Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); | ||||||
|  |     Coordinate local({L,L,L,L}); | ||||||
|  |     Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); | ||||||
|  |      | ||||||
|  |     GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, | ||||||
|  | 								       GridDefaultSimd(Nd,vComplex::Nsimd()), | ||||||
|  | 								       GridDefaultMpi()); | ||||||
|  |     uint64_t NP = TmpGrid->RankCount(); | ||||||
|  |     uint64_t NN = TmpGrid->NodeCount(); | ||||||
|  |     NN_global=NN; | ||||||
|  |     uint64_t SHM=NP/NN; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     ///////// Welcome message //////////// | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "Benchmark Laplace on "<<L<<"^4 local volume "<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl; | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|  |     ///////// Lattice Init //////////// | ||||||
|  |     GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); | ||||||
|  |     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid); | ||||||
|  |      | ||||||
|  |     ///////// RNG Init //////////// | ||||||
|  |     std::vector<int> seeds4({1,2,3,4}); | ||||||
|  |     GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4); | ||||||
|  |     std::cout << GridLogMessage << "Initialised RNGs" << std::endl; | ||||||
|  |  | ||||||
|  |     RealD mass=0.1; | ||||||
|  |     RealD c1=9.0/8.0; | ||||||
|  |     RealD c2=-1.0/24.0; | ||||||
|  |     RealD u0=1.0; | ||||||
|  |  | ||||||
|  | //    typedef ImprovedStaggeredFermionF Action; | ||||||
|  | //    typedef typename Action::FermionField Fermion;  | ||||||
|  |     typedef LatticeGaugeFieldF Gauge; | ||||||
|  |      | ||||||
|  |     Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu);  | ||||||
|  |  | ||||||
|  | //    typename Action::ImplParams params; | ||||||
|  | //    Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); | ||||||
|  |  | ||||||
|  | //  PeriodicGimplF | ||||||
|  |     typedef typename PeriodicGimplF::LinkField GaugeLinkFieldF; | ||||||
|  |  | ||||||
|  |     ///////// Source preparation //////////// | ||||||
|  |     GaugeLinkFieldF src   (FGrid); random(RNG4,src); | ||||||
|  | //    GaugeLinkFieldF src_e (FrbGrid); | ||||||
|  | //    GaugeLinkFieldF src_o (FrbGrid); | ||||||
|  | //    GaugeLinkFieldF r_e   (FrbGrid); | ||||||
|  | //    GaugeLinkFieldF r_o   (FrbGrid); | ||||||
|  |     GaugeLinkFieldF r_eo  (FGrid); | ||||||
|  |    | ||||||
|  |     { | ||||||
|  |  | ||||||
|  |  //     pickCheckerboard(Even,src_e,src); | ||||||
|  |  //     pickCheckerboard(Odd,src_o,src); | ||||||
|  |      | ||||||
|  |       const int num_cases = 1; | ||||||
|  |       std::string fmt("G/O/C  "); | ||||||
|  |        | ||||||
|  |       controls Cases [] = { | ||||||
|  | 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }, | ||||||
|  |       };  | ||||||
|  |  | ||||||
|  |       for(int c=0;c<num_cases;c++) { | ||||||
|  |         CovariantAdjointLaplacianStencil<PeriodicGimplF,typename PeriodicGimplF::LinkField> LapStencilF(FGrid); | ||||||
|  |         QuadLinearOperator<CovariantAdjointLaplacianStencil<PeriodicGimplF,typename PeriodicGimplF::LinkField>,PeriodicGimplF::LinkField> QuadOpF(LapStencilF,c2,c1,1.); | ||||||
|  |         LapStencilF.GaugeImport(Umu); | ||||||
|  | 	 | ||||||
|  |  | ||||||
|  | 	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap; | ||||||
|  | 	StaggeredKernelsStatic::Opt   = Cases[c].Opt; | ||||||
|  | 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); | ||||||
|  |        | ||||||
|  | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  | 	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using Stencil Nc Laplace" <<std::endl; | ||||||
|  | 	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; | ||||||
|  | 	if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl; | ||||||
|  | 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  | 	 | ||||||
|  | 	int nwarm = 10; | ||||||
|  | 	double t0=usecond(); | ||||||
|  | 	FGrid->Barrier(); | ||||||
|  | 	for(int i=0;i<nwarm;i++){ | ||||||
|  | //	  Ds.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  |           QuadOpF.HermOp(src,r_eo); | ||||||
|  | 	} | ||||||
|  | 	FGrid->Barrier(); | ||||||
|  | 	double t1=usecond(); | ||||||
|  | 	uint64_t ncall = 500; | ||||||
|  |  | ||||||
|  | 	FGrid->Broadcast(0,&ncall,sizeof(ncall)); | ||||||
|  |  | ||||||
|  | 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; | ||||||
|  |  | ||||||
|  | 	time_statistics timestat; | ||||||
|  | 	std::vector<double> t_time(ncall); | ||||||
|  | 	for(uint64_t i=0;i<ncall;i++){ | ||||||
|  | 	  t0=usecond(); | ||||||
|  | //	  Ds.DhopEO(src_o,r_e,DaggerNo); | ||||||
|  |           QuadOpF.HermOp(src,r_eo); | ||||||
|  | 	  t1=usecond(); | ||||||
|  | 	  t_time[i] = t1-t0; | ||||||
|  | 	} | ||||||
|  | 	FGrid->Barrier(); | ||||||
|  | 	 | ||||||
|  | 	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; | ||||||
|  | //	double flops=(1146.0*volume)/2; | ||||||
|  | 	double flops=(2*2*8*216.0*volume); | ||||||
|  | 	double mf_hi, mf_lo, mf_err; | ||||||
|  | 	 | ||||||
|  | 	timestat.statistics(t_time); | ||||||
|  | 	mf_hi = flops/timestat.min; | ||||||
|  | 	mf_lo = flops/timestat.max; | ||||||
|  | 	mf_err= flops/timestat.min * timestat.err/timestat.mean; | ||||||
|  |  | ||||||
|  | 	mflops = flops/timestat.mean; | ||||||
|  | 	mflops_all.push_back(mflops); | ||||||
|  | 	if ( mflops_best == 0   ) mflops_best = mflops; | ||||||
|  | 	if ( mflops_worst== 0   ) mflops_worst= mflops; | ||||||
|  | 	if ( mflops>mflops_best ) mflops_best = mflops; | ||||||
|  | 	if ( mflops<mflops_worst) mflops_worst= mflops; | ||||||
|  | 	 | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s per rank   "<< mflops/NP<<std::endl; | ||||||
|  | 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Quad mflop/s per node   "<< mflops/NN<<std::endl; | ||||||
|  | 	FGrid->Barrier(); | ||||||
|  |        | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4  Quad Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage << L<<"^4  Quad Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl; | ||||||
|  |       std::cout<<GridLogMessage <<fmt << std::endl; | ||||||
|  |       std::cout<<GridLogMessage ; | ||||||
|  | 	FGrid->Barrier(); | ||||||
|  |  | ||||||
|  |       for(int i=0;i<mflops_all.size();i++){ | ||||||
|  | 	std::cout<<mflops_all[i]/NN<<" ; " ; | ||||||
|  |       } | ||||||
|  |       std::cout<<std::endl; | ||||||
|  |     } | ||||||
|  |     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |     return mflops_best; | ||||||
|  |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -662,6 +832,7 @@ int main (int argc, char ** argv) | |||||||
|   std::vector<double> wilson; |   std::vector<double> wilson; | ||||||
|   std::vector<double> dwf4; |   std::vector<double> dwf4; | ||||||
|   std::vector<double> staggered; |   std::vector<double> staggered; | ||||||
|  |   std::vector<double> lap; | ||||||
|  |  | ||||||
|   int Ls=1; |   int Ls=1; | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
| @@ -688,12 +859,20 @@ int main (int argc, char ** argv) | |||||||
|     staggered.push_back(result); |     staggered.push_back(result); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << " Laplace QuadOp 4D " <<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |   for(int l=0;l<L_list.size();l++){ | ||||||
|  |     double result = Benchmark::Laplace(L_list[l]) ; | ||||||
|  |     lap.push_back(result); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; |   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered" <<std::endl; |   std::cout<<GridLogMessage << "L \t\t Wilson \t\t DWF4 \t\t Staggered \t\t Quad Laplace" <<std::endl; | ||||||
|   for(int l=0;l<L_list.size();l++){ |   for(int l=0;l<L_list.size();l++){ | ||||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; |     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<< " \t\t "<< lap[l]<< std::endl; | ||||||
|   } |   } | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -90,11 +90,11 @@ int main (int argc, char ** argv) | |||||||
|   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; |   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; | ||||||
|  |  | ||||||
|   for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0; |   for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0; | ||||||
|   //  Dirichlet[0] = 0; |   Dirichlet[0] = 0; | ||||||
|   //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; |   Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; | ||||||
|   //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; |   Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; | ||||||
|   //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; |   Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; | ||||||
|   //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; |   Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; | ||||||
|  |  | ||||||
|   Benchmark(Ls,Dirichlet); |   Benchmark(Ls,Dirichlet); | ||||||
|  |  | ||||||
| @@ -105,11 +105,11 @@ int main (int argc, char ** argv) | |||||||
|   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; |   std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; | ||||||
|  |  | ||||||
|   for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0; |   for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0; | ||||||
|   //  Dirichlet[0] = 0; |   Dirichlet[0] = 0; | ||||||
|   //  Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; |   Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0]; | ||||||
|   //  Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; |   Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1]; | ||||||
|   //  Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; |   Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2]; | ||||||
|   //  Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; |   Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3]; | ||||||
|    |    | ||||||
|   Benchmark(Ls,Dirichlet); |   Benchmark(Ls,Dirichlet); | ||||||
|  |  | ||||||
| @@ -185,7 +185,6 @@ void Benchmark(int Ls, Coordinate Dirichlet) | |||||||
|   GaugeField Umu(UGrid); |   GaugeField Umu(UGrid); | ||||||
|   GaugeField UmuCopy(UGrid); |   GaugeField UmuCopy(UGrid); | ||||||
|   SU<Nc>::HotConfiguration(RNG4,Umu); |   SU<Nc>::HotConfiguration(RNG4,Umu); | ||||||
|   //  SU<Nc>::ColdConfiguration(Umu); |  | ||||||
|   UmuCopy=Umu; |   UmuCopy=Umu; | ||||||
|   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; |   std::cout << GridLogMessage << "Random gauge initialised " << std::endl; | ||||||
|  |  | ||||||
| @@ -308,14 +307,6 @@ void Benchmark(int Ls, Coordinate Dirichlet) | |||||||
|     if(( n2e>1.0e-4) ) { |     if(( n2e>1.0e-4) ) { | ||||||
|       std::cout<<GridLogMessage << "WRONG RESULT" << std::endl; |       std::cout<<GridLogMessage << "WRONG RESULT" << std::endl; | ||||||
|       FGrid->Barrier(); |       FGrid->Barrier(); | ||||||
|       std::cout<<GridLogMessage << "RESULT" << std::endl; |  | ||||||
|       //      std::cout << result<<std::endl; |  | ||||||
|       std::cout << norm2(result)<<std::endl; |  | ||||||
|       std::cout<<GridLogMessage << "REF" << std::endl; |  | ||||||
|       std::cout << norm2(ref)<<std::endl; |  | ||||||
|       std::cout<<GridLogMessage << "ERR" << std::endl; |  | ||||||
|       std::cout << norm2(err)<<std::endl; |  | ||||||
|       FGrid->Barrier(); |  | ||||||
|       exit(-1); |       exit(-1); | ||||||
|     } |     } | ||||||
|     assert (n2e< 1.0e-4 ); |     assert (n2e< 1.0e-4 ); | ||||||
|   | |||||||
| @@ -1,968 +0,0 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
|     Source file: ./benchmarks/Benchmark_usqcd.cc |  | ||||||
|  |  | ||||||
|     Copyright (C) 2015 |  | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |  | ||||||
| Author: paboyle <paboyle@ph.ed.ac.uk> |  | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |  | ||||||
|     it under the terms of the GNU General Public License as published by |  | ||||||
|     the Free Software Foundation; either version 2 of the License, or |  | ||||||
|     (at your option) any later version. |  | ||||||
|  |  | ||||||
|     This program is distributed in the hope that it will be useful, |  | ||||||
|     but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
|     GNU General Public License for more details. |  | ||||||
|  |  | ||||||
|     You should have received a copy of the GNU General Public License along |  | ||||||
|     with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
|     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
|     *************************************************************************************/ |  | ||||||
|     /*  END LEGAL */ |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
| #include <Grid/algorithms/blas/BatchedBlas.h> |  | ||||||
|  |  | ||||||
| using namespace Grid; |  | ||||||
|  |  | ||||||
// File-scope state shared by the benchmark routines.
// NOTE(review): L_list, Ls_list, mflop_list, mflop_ref and mflop_ref_err are
// not referenced by the routines visible in this part of the file; presumably
// they are filled in by the driver further down -- confirm.
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;

double mflop_ref;
double mflop_ref_err;

// Node count of the most recently benchmarked grid; assigned from
// TmpGrid->NodeCount() inside Benchmark::DWF and Benchmark::Staggered.
int NN_global;

// Output stream for the machine-readable results written with fprintf() by
// Comms, Memory and BLAS.  It is not opened in the routines shown here, so it
// must be a valid open FILE* before any of them is called.
FILE * FP;
|  |  | ||||||
// Summary statistics of a set of timing samples.
//   mean : arithmetic mean of the samples
//   err  : standard error of the mean, sqrt( sum (x-mean)^2 / (n*(n-1)) )
//   min  : smallest sample
//   max  : largest sample
struct time_statistics{
  double mean;
  double err;
  double min;
  double max;

  // Recompute all four members from the samples in v.
  //  - empty input : every member is set to 0 (previously this dereferenced
  //                  the end iterator and divided by zero);
  //  - one sample  : err is 0 (previously 0/0 = NaN).
  // The samples are taken by const reference, so the caller's vector is
  // neither copied nor modified.
  void statistics(const std::vector<double> &v){
    const std::size_t n = v.size();
    if ( n == 0 ) {
      mean = 0.0;
      err  = 0.0;
      min  = 0.0;
      max  = 0.0;
      return;
    }

    const double avg = std::accumulate(v.begin(), v.end(), 0.0) / n;
    mean = avg;

    // Sum of squared deviations, accumulated in sample order without a
    // temporary vector.
    const double sq_sum = std::accumulate(v.begin(), v.end(), 0.0,
                                          [avg](double acc, double x) {
                                            const double d = x - avg;
                                            return acc + d*d;
                                          });
    err = ( n > 1 ) ? std::sqrt(sq_sum / (n*(n - 1))) : 0.0;

    const auto extremes = std::minmax_element(v.begin(), v.end());
    min = *extremes.first;
    max = *extremes.second;
  }
};
|  |  | ||||||
| void comms_header(){ |  | ||||||
|   std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t" |  | ||||||
|             <<"bytes\t MB/s uni  \t\t MB/s bidi "<<std::endl; |  | ||||||
| }; |  | ||||||
|  |  | ||||||
// One benchmark configuration.  Instances are brace-initialised positionally
// in the Cases[] tables, so the member order must not change.
struct controls {
  int Opt;          // kernel variant; copied into WilsonKernelsStatic::Opt by DWF
  int CommsOverlap; // comms/compute mode; copied into WilsonKernelsStatic::Comms by DWF
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch; // passed to CartesianCommunicator::SetCommunicatorPolicy
};
|  |  | ||||||
| class Benchmark { |  | ||||||
| public: |  | ||||||
|   static void Decomposition (void ) { |  | ||||||
|  |  | ||||||
|     int threads = GridThread::GetThreads(); |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n"; |  | ||||||
|     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage<<"\tvReal          : "<<sizeof(vReal )*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage<<"\tvComplex       : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   static void Comms(void) |  | ||||||
|   { |  | ||||||
|     int Nloop=200; |  | ||||||
|     int nmu=0; |  | ||||||
|     int maxlat=32; |  | ||||||
|  |  | ||||||
|     Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd()); |  | ||||||
|     Coordinate mpi_layout  = GridDefaultMpi(); |  | ||||||
|  |  | ||||||
|     for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++; |  | ||||||
|  |  | ||||||
|     std::vector<double> t_time(Nloop); |  | ||||||
|     time_statistics timestat; |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |  | ||||||
|     comms_header(); |  | ||||||
|  |  | ||||||
|     fprintf(FP,"Communications\n\n"); |  | ||||||
|     fprintf(FP,"Packet bytes, direction, GB/s per node\n"); |  | ||||||
|     for(int lat=16;lat<=maxlat;lat+=8){ |  | ||||||
|       //      for(int Ls=8;Ls<=8;Ls*=2){ |  | ||||||
|       { int Ls=12; |  | ||||||
|  |  | ||||||
| 	Coordinate latt_size  ({lat*mpi_layout[0], |  | ||||||
| 	      lat*mpi_layout[1], |  | ||||||
| 	      lat*mpi_layout[2], |  | ||||||
| 	      lat*mpi_layout[3]}); |  | ||||||
|  |  | ||||||
| 	GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |  | ||||||
| 	RealD Nrank = Grid._Nprocessors; |  | ||||||
| 	RealD Nnode = Grid.NodeCount(); |  | ||||||
| 	RealD ppn = Nrank/Nnode; |  | ||||||
|  |  | ||||||
| 	std::vector<HalfSpinColourVectorD *> xbuf(8); |  | ||||||
| 	std::vector<HalfSpinColourVectorD *> rbuf(8); |  | ||||||
| 	//Grid.ShmBufferFreeAll(); |  | ||||||
| 	uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); |  | ||||||
| 	for(int d=0;d<8;d++){ |  | ||||||
| 	  xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); |  | ||||||
| 	  rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); |  | ||||||
| 	  //	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); |  | ||||||
| 	  //	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	//	int ncomm; |  | ||||||
| 	double dbytes; |  | ||||||
|  |  | ||||||
|         for(int dir=0;dir<8;dir++) { |  | ||||||
| 	  int mu =dir % 4; |  | ||||||
| 	  if (mpi_layout[mu]>1 ) { |  | ||||||
|  |  | ||||||
| 	    std::vector<double> times(Nloop); |  | ||||||
| 	    for(int i=0;i<Nloop;i++){ |  | ||||||
|  |  | ||||||
| 	      dbytes=0;	         |  | ||||||
| 	      double start=usecond(); |  | ||||||
| 	      int xmit_to_rank; |  | ||||||
| 	      int recv_from_rank; |  | ||||||
|  |  | ||||||
| 	      if ( dir == mu ) {  |  | ||||||
| 		int comm_proc=1; |  | ||||||
| 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); |  | ||||||
| 	      } else {  |  | ||||||
| 		int comm_proc = mpi_layout[mu]-1; |  | ||||||
| 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); |  | ||||||
| 	      } |  | ||||||
| 	      Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, |  | ||||||
| 				  (void *)&rbuf[dir][0], recv_from_rank, |  | ||||||
| 				  bytes); |  | ||||||
| 	      dbytes+=bytes; |  | ||||||
| 	      |  | ||||||
| 	      double stop=usecond(); |  | ||||||
| 	      t_time[i] = stop-start; // microseconds |  | ||||||
|  |  | ||||||
| 	    } |  | ||||||
| 	    timestat.statistics(t_time); |  | ||||||
| 	   |  | ||||||
| 	    dbytes=dbytes*ppn; |  | ||||||
| 	    double xbytes    = dbytes*0.5; |  | ||||||
| 	    double bidibytes = dbytes; |  | ||||||
| 	   |  | ||||||
| 	    std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t " |  | ||||||
| 		     << bytes << " \t " |  | ||||||
| 		     <<xbytes/timestat.mean |  | ||||||
| 		     << "\t\t" |  | ||||||
| 		     << bidibytes/timestat.mean<< std::endl; |  | ||||||
| 	    fprintf(FP,"%ld, %d, %f\n",(long)bytes,dir,bidibytes/timestat.mean/1000.); |  | ||||||
| 	  } |  | ||||||
| 	} |  | ||||||
| 	for(int d=0;d<8;d++){ |  | ||||||
| 	  acceleratorFreeDevice(xbuf[d]); |  | ||||||
| 	  acceleratorFreeDevice(rbuf[d]); |  | ||||||
| 	} |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     fprintf(FP,"\n\n"); |  | ||||||
|      |  | ||||||
|     return; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|    |  | ||||||
|   static void Memory(void) |  | ||||||
|   { |  | ||||||
|     const int Nvec=8; |  | ||||||
|     typedef Lattice< iVector< vReal,Nvec> > LatticeVec; |  | ||||||
|     typedef iVector<vReal,Nvec> Vec; |  | ||||||
|  |  | ||||||
|     Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd()); |  | ||||||
|     Coordinate mpi_layout  = GridDefaultMpi(); |  | ||||||
|  |  | ||||||
|     fprintf(FP,"Memory Bandwidth\n\n"); |  | ||||||
|     fprintf(FP,"Bytes, GB/s per node\n"); |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |  | ||||||
|    |  | ||||||
|     //    uint64_t NP; |  | ||||||
|     uint64_t NN; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   uint64_t lmax=40; |  | ||||||
| #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat) |  | ||||||
|  |  | ||||||
|     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); |  | ||||||
|     for(int lat=8;lat<=lmax;lat+=8){ |  | ||||||
|  |  | ||||||
|       Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |  | ||||||
|       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |  | ||||||
|  |  | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |  | ||||||
|  |  | ||||||
|       //      NP= Grid.RankCount(); |  | ||||||
|       NN =Grid.NodeCount(); |  | ||||||
|  |  | ||||||
|       Vec rn ; random(sRNG,rn); |  | ||||||
|  |  | ||||||
|       LatticeVec z(&Grid); z=Zero(); |  | ||||||
|       LatticeVec x(&Grid); x=Zero(); |  | ||||||
|       LatticeVec y(&Grid); y=Zero(); |  | ||||||
|       double a=2.0; |  | ||||||
|  |  | ||||||
|       uint64_t Nloop=NLOOP; |  | ||||||
|  |  | ||||||
|       double start=usecond(); |  | ||||||
|       for(int i=0;i<Nloop;i++){ |  | ||||||
| 	z=a*x-y; |  | ||||||
|       } |  | ||||||
|       double stop=usecond(); |  | ||||||
|       double time = (stop-start)/Nloop*1000; |  | ||||||
|       |  | ||||||
|       double flops=vol*Nvec*2;// mul,add |  | ||||||
|       double bytes=3.0*vol*Nvec*sizeof(Real); |  | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |  | ||||||
| 	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000. |  | ||||||
| 	       << "\t\t"<< bytes/time/NN <<std::endl; |  | ||||||
|  |  | ||||||
|       fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN); |  | ||||||
|  |  | ||||||
|     } |  | ||||||
|     fprintf(FP,"\n\n"); |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   static void BLAS(void) |  | ||||||
|   { |  | ||||||
|     //int nbasis, int nrhs, int coarseVol |  | ||||||
|     int  basis[] = { 16,32,64 }; |  | ||||||
|     int  rhs[]   = { 8,16,32 }; |  | ||||||
|     int  vol  = 4*4*4*4; |  | ||||||
|  |  | ||||||
|     GridBLAS blas; |  | ||||||
|      |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |  | ||||||
|    |  | ||||||
|     fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n"); |  | ||||||
|  |  | ||||||
|     for(int b=0;b<3;b++){ |  | ||||||
|     for(int r=0;r<3;r++){ |  | ||||||
|       int M=basis[b]; |  | ||||||
|       int N=rhs[r]; |  | ||||||
|       int K=basis[b]; |  | ||||||
|       int BATCH=vol; |  | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |  | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |  | ||||||
|        |  | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |  | ||||||
| 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl; |  | ||||||
|     }} |  | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block project)"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |  | ||||||
|     for(int b=0;b<3;b++){ |  | ||||||
|     for(int r=0;r<3;r++){ |  | ||||||
|       int M=basis[b]; |  | ||||||
|       int N=rhs[r]; |  | ||||||
|       int K=vol; |  | ||||||
|       int BATCH=vol; |  | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |  | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |  | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |  | ||||||
| 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl; |  | ||||||
|     }} |  | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "  M  "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block promote)"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |  | ||||||
|     for(int b=0;b<3;b++){ |  | ||||||
|     for(int r=0;r<3;r++){ |  | ||||||
|       int M=rhs[r]; |  | ||||||
|       int N=vol; |  | ||||||
|       int K=basis[b]; |  | ||||||
|       int BATCH=vol; |  | ||||||
|       double p=blas.benchmark(M,N,K,BATCH); |  | ||||||
|  |  | ||||||
|       fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p); |  | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |  | ||||||
| 	       << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl; |  | ||||||
|     }} |  | ||||||
|     fprintf(FP,"\n\n\n"); |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   }; |  | ||||||
|    |  | ||||||
|  |  | ||||||
|   static void SU4(void) |  | ||||||
|   { |  | ||||||
|     const int Nc4=4; |  | ||||||
|     typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4; |  | ||||||
|  |  | ||||||
|     Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); |  | ||||||
|     Coordinate mpi_layout  = GridDefaultMpi(); |  | ||||||
|      |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |  | ||||||
|    |  | ||||||
|     uint64_t NN; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     uint64_t lmax=32; |  | ||||||
|  |  | ||||||
|     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); |  | ||||||
|     for(int lat=8;lat<=lmax;lat+=8){ |  | ||||||
|  |  | ||||||
|       Coordinate latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |  | ||||||
|       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |  | ||||||
|  |  | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |  | ||||||
|  |  | ||||||
|       NN =Grid.NodeCount(); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|       LatticeSU4 z(&Grid); z=Zero(); |  | ||||||
|       LatticeSU4 x(&Grid); x=Zero(); |  | ||||||
|       LatticeSU4 y(&Grid); y=Zero(); |  | ||||||
|       //      double a=2.0; |  | ||||||
|  |  | ||||||
|       uint64_t Nloop=NLOOP; |  | ||||||
|  |  | ||||||
|       double start=usecond(); |  | ||||||
|       for(int i=0;i<Nloop;i++){ |  | ||||||
| 	z=x*y; |  | ||||||
|       } |  | ||||||
|       double stop=usecond(); |  | ||||||
|       double time = (stop-start)/Nloop*1000; |  | ||||||
|       |  | ||||||
|       double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add |  | ||||||
|       double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF); |  | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3)  |  | ||||||
| 	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000. |  | ||||||
| 	       << "\t\t"<< bytes/time/NN <<std::endl; |  | ||||||
|  |  | ||||||
|     } |  | ||||||
|   }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
  // Domain-wall fermion Dhop benchmark (single precision).
  //
  // Builds an L^4 local volume per rank with fifth dimension Ls, a random
  // gauge field and a random source, then for each kernel configuration in
  // Cases[] times repeated calls of Dw.DhopEO(src_o,r_e,DaggerNo).
  //
  // Parameters:
  //   Ls : extent of the fifth dimension
  //   L  : local lattice extent in each of the four space-time directions
  // Returns:
  //   the largest mean rate (flops / mean call time) over the cases run.
  //
  // Side effects: overwrites WilsonKernelsStatic::Comms, WilsonKernelsStatic::Opt
  // and the communicator policy, sets NN_global, and writes to the log.
  //
  // NOTE(review): the five grids obtained from SpaceTimeGrid::make* are held
  // as raw pointers and are not deleted in this function -- confirm whether
  // the caller or the library is expected to release them.
  static double DWF(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
    Coordinate local({L,L,L,L});
    Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});

    // TmpGrid is only queried for the rank and node counts.
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, 
								       GridDefaultSimd(Nd,vComplex::Nsimd()),
								       GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;


    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Nc             : "<<Nc<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl;
    std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl;
    std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

    
    ///////// RNG Init ////////////
    // Fixed seeds so every run uses the same gauge field and source.
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    typedef DomainWallFermionF Action;
    typedef typename Action::FermionField Fermion;
    typedef LatticeGaugeFieldF Gauge;
    
    ///////// Source preparation ////////////
    Gauge Umu(UGrid);  SU<Nc>::HotConfiguration(RNG4,Umu); 
    Fermion src   (FGrid); random(RNG5,src);
    Fermion src_e (FrbGrid);
    Fermion src_o (FrbGrid);
    Fermion r_e   (FrbGrid);
    Fermion r_o   (FrbGrid);
    Fermion r_eo  (FGrid);
    Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

      // Cases[] below always has three entries; the third (OptInlineAsm) is
      // only run when AVX512 is defined.
#ifdef AVX512
      const int num_cases = 3;
#else 
      const int num_cases = 2;
#endif      
      // Printed as a legend above the per-case summary line.
      // NOTE(review): it names four columns while at most three cases run.
      std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");

      controls Cases [] = {
	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
      }; 

      for(int c=0;c<num_cases;c++) {
	
	// Select the kernel variant and comms policy for this case (global state).
	WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
	WilsonKernelsStatic::Opt   = Cases[c].Opt;
	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM      WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

	// Warm-up calls.  t0/t1 recorded here are overwritten in the timing
	// loop below before they are read, so the warm-up is not measured.
	int nwarm = 10;
	double t0=usecond();
	FGrid->Barrier();
	for(int i=0;i<nwarm;i++){
	  Dw.DhopEO(src_o,r_e,DaggerNo);
	}
	FGrid->Barrier();
	double t1=usecond();
	uint64_t ncall = 500;

	// Take the call count from rank 0 so that all ranks use the same value.
	FGrid->Broadcast(0,&ncall,sizeof(ncall));

	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;

	// Each call is timed individually; no barrier between calls.
	time_statistics timestat;
	std::vector<double> t_time(ncall);
	for(uint64_t i=0;i<ncall;i++){
	  t0=usecond();
	  Dw.DhopEO(src_o,r_e,DaggerNo);
	  t1=usecond();
	  t_time[i] = t1-t0;
	}
	FGrid->Barrier();
	
	// Global five-dimensional volume: Ls times the product of the 4-d extents.
	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];

	// Nc=3 gives
	// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
	// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2  + Nd*Nc*Ns*2
	//	double flops=(1344.0*volume)/2;
	double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns  + 2*Nd*Nc*Ns*2;

	// Halved: DhopEO is applied to a red-black (checkerboarded) field.
	double flops=(fps*volume)/2;
	double mf_hi, mf_lo, mf_err;

	// Rates are flops divided by the usecond() interval, reported as
	// mflop/s (assumes usecond() returns microseconds -- TODO confirm).
	timestat.statistics(t_time);
	mf_hi = flops/timestat.min;
	mf_lo = flops/timestat.max;
	mf_err= flops/timestat.min * timestat.err/timestat.mean;

	mflops = flops/timestat.mean;
	mflops_all.push_back(mflops);
	// A value of 0 marks best/worst as "not yet set".
	if ( mflops_best == 0   ) mflops_best = mflops;
	if ( mflops_worst== 0   ) mflops_worst= mflops;
	if ( mflops>mflops_best ) mflops_best = mflops;
	if ( mflops<mflops_worst) mflops_worst= mflops;

	std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;

      }

      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

      // Per-node rate of every case, on one line after the log prefix.
      for(int i=0;i<mflops_all.size();i++){
	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }
|  |  | ||||||
|  |  | ||||||
|   static double Staggered(int L) |  | ||||||
|   { |  | ||||||
|     double mflops; |  | ||||||
|     double mflops_best = 0; |  | ||||||
|     double mflops_worst= 0; |  | ||||||
|     std::vector<double> mflops_all; |  | ||||||
|  |  | ||||||
|     /////////////////////////////////////////////////////// |  | ||||||
|     // Set/Get the layout & grid size |  | ||||||
|     /////////////////////////////////////////////////////// |  | ||||||
|     int threads = GridThread::GetThreads(); |  | ||||||
|     Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); |  | ||||||
|     Coordinate local({L,L,L,L}); |  | ||||||
|     Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); |  | ||||||
|      |  | ||||||
|     GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, |  | ||||||
| 								       GridDefaultSimd(Nd,vComplex::Nsimd()), |  | ||||||
| 								       GridDefaultMpi()); |  | ||||||
|     uint64_t NP = TmpGrid->RankCount(); |  | ||||||
|     uint64_t NN = TmpGrid->NodeCount(); |  | ||||||
|     NN_global=NN; |  | ||||||
|     uint64_t SHM=NP/NN; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     ///////// Welcome message //////////// |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|     ///////// Lattice Init //////////// |  | ||||||
|     GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); |  | ||||||
|     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid); |  | ||||||
|      |  | ||||||
|     ///////// RNG Init //////////// |  | ||||||
|     std::vector<int> seeds4({1,2,3,4}); |  | ||||||
|     GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4); |  | ||||||
|     std::cout << GridLogMessage << "Initialised RNGs" << std::endl; |  | ||||||
|  |  | ||||||
|     RealD mass=0.1; |  | ||||||
|     RealD c1=9.0/8.0; |  | ||||||
|     RealD c2=-1.0/24.0; |  | ||||||
|     RealD u0=1.0; |  | ||||||
|  |  | ||||||
|     typedef ImprovedStaggeredFermionF Action; |  | ||||||
|     typedef typename Action::FermionField Fermion;  |  | ||||||
|     typedef LatticeGaugeFieldF Gauge; |  | ||||||
|      |  | ||||||
|     Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu);  |  | ||||||
|  |  | ||||||
|     typename Action::ImplParams params; |  | ||||||
|     Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); |  | ||||||
|  |  | ||||||
|     ///////// Source preparation //////////// |  | ||||||
|     Fermion src   (FGrid); random(RNG4,src); |  | ||||||
|     Fermion src_e (FrbGrid); |  | ||||||
|     Fermion src_o (FrbGrid); |  | ||||||
|     Fermion r_e   (FrbGrid); |  | ||||||
|     Fermion r_o   (FrbGrid); |  | ||||||
|     Fermion r_eo  (FGrid); |  | ||||||
|    |  | ||||||
|     { |  | ||||||
|  |  | ||||||
|       pickCheckerboard(Even,src_e,src); |  | ||||||
|       pickCheckerboard(Odd,src_o,src); |  | ||||||
|      |  | ||||||
|       const int num_cases = 2; |  | ||||||
|       std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); |  | ||||||
|        |  | ||||||
|       controls Cases [] = { |  | ||||||
| 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }, |  | ||||||
| 	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }, |  | ||||||
| 	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  } |  | ||||||
|       };  |  | ||||||
|  |  | ||||||
|       for(int c=0;c<num_cases;c++) { |  | ||||||
| 	 |  | ||||||
| 	StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap; |  | ||||||
| 	StaggeredKernelsStatic::Opt   = Cases[c].Opt; |  | ||||||
| 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); |  | ||||||
|        |  | ||||||
| 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
| 	if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl; |  | ||||||
| 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; |  | ||||||
| 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
| 	 |  | ||||||
| 	int nwarm = 10; |  | ||||||
| 	double t0=usecond(); |  | ||||||
| 	FGrid->Barrier(); |  | ||||||
| 	for(int i=0;i<nwarm;i++){ |  | ||||||
| 	  Ds.DhopEO(src_o,r_e,DaggerNo); |  | ||||||
| 	} |  | ||||||
| 	FGrid->Barrier(); |  | ||||||
| 	double t1=usecond(); |  | ||||||
| 	uint64_t ncall = 500; |  | ||||||
|  |  | ||||||
| 	FGrid->Broadcast(0,&ncall,sizeof(ncall)); |  | ||||||
|  |  | ||||||
| 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; |  | ||||||
|  |  | ||||||
| 	time_statistics timestat; |  | ||||||
| 	std::vector<double> t_time(ncall); |  | ||||||
| 	for(uint64_t i=0;i<ncall;i++){ |  | ||||||
| 	  t0=usecond(); |  | ||||||
| 	  Ds.DhopEO(src_o,r_e,DaggerNo); |  | ||||||
| 	  t1=usecond(); |  | ||||||
| 	  t_time[i] = t1-t0; |  | ||||||
| 	} |  | ||||||
| 	FGrid->Barrier(); |  | ||||||
| 	 |  | ||||||
| 	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |  | ||||||
| 	double flops=(1146.0*volume)/2; |  | ||||||
| 	double mf_hi, mf_lo, mf_err; |  | ||||||
| 	 |  | ||||||
| 	timestat.statistics(t_time); |  | ||||||
| 	mf_hi = flops/timestat.min; |  | ||||||
| 	mf_lo = flops/timestat.max; |  | ||||||
| 	mf_err= flops/timestat.min * timestat.err/timestat.mean; |  | ||||||
|  |  | ||||||
| 	mflops = flops/timestat.mean; |  | ||||||
| 	mflops_all.push_back(mflops); |  | ||||||
| 	if ( mflops_best == 0   ) mflops_best = mflops; |  | ||||||
| 	if ( mflops_worst== 0   ) mflops_worst= mflops; |  | ||||||
| 	if ( mflops>mflops_best ) mflops_best = mflops; |  | ||||||
| 	if ( mflops<mflops_worst) mflops_worst= mflops; |  | ||||||
| 	 |  | ||||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; |  | ||||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl; |  | ||||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl; |  | ||||||
|        |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|       std::cout<<GridLogMessage << L<<"^4  Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl; |  | ||||||
|       std::cout<<GridLogMessage << L<<"^4  Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl; |  | ||||||
|       std::cout<<GridLogMessage <<fmt << std::endl; |  | ||||||
|       std::cout<<GridLogMessage ; |  | ||||||
|  |  | ||||||
|       for(int i=0;i<mflops_all.size();i++){ |  | ||||||
| 	std::cout<<mflops_all[i]/NN<<" ; " ; |  | ||||||
|       } |  | ||||||
|       std::cout<<std::endl; |  | ||||||
|     } |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     return mflops_best; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   /////////////////////////////////////////////////////// |  | ||||||
|   // Clover: time the single precision WilsonCloverFermionF operator M(). |  | ||||||
|   // |  | ||||||
|   //   L       : local lattice extent per rank; the global lattice is L*mpi[mu] in each direction. |  | ||||||
|   //   returns : the best mean rate (flops / mean time per call) over the benchmarked cases. |  | ||||||
|   // |  | ||||||
|   // Rates are reported as mflop/s, which presumes usecond() returns microseconds (TODO confirm). |  | ||||||
|   // Side effects: sets NN_global, the static WilsonKernelsStatic::Comms/Opt selections and the |  | ||||||
|   // communicator policy; logs through GridLogMessage. |  | ||||||
|   // NOTE(review): TmpGrid, FGrid and FrbGrid are never released in this function. |  | ||||||
|   /////////////////////////////////////////////////////// |  | ||||||
|   static double Clover(int L) |  | ||||||
|   { |  | ||||||
|     double mflops; |  | ||||||
|     double mflops_best = 0; |  | ||||||
|     double mflops_worst= 0; |  | ||||||
|     std::vector<double> mflops_all; |  | ||||||
|  |  | ||||||
|     /////////////////////////////////////////////////////// |  | ||||||
|     // Set/Get the layout & grid size |  | ||||||
|     /////////////////////////////////////////////////////// |  | ||||||
|     int threads = GridThread::GetThreads(); |  | ||||||
|     Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); |  | ||||||
|     Coordinate local({L,L,L,L}); |  | ||||||
|     Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); |  | ||||||
|      |  | ||||||
|     // Temporary grid used only to query the rank and node counts |  | ||||||
|     GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, |  | ||||||
| 								       GridDefaultSimd(Nd,vComplex::Nsimd()), |  | ||||||
| 								       GridDefaultMpi()); |  | ||||||
|     uint64_t NP = TmpGrid->RankCount(); |  | ||||||
|     uint64_t NN = TmpGrid->NodeCount(); |  | ||||||
|     NN_global=NN; |  | ||||||
|     uint64_t SHM=NP/NN; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     ///////// Welcome message //////////// |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "Benchmark Clover on "<<L<<"^4 local volume "<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* ranks          : "<<NP  <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* nodes          : "<<NN  <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* ranks/node     : "<<SHM <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* ranks geom     : "<<GridCmdVectorIntToString(mpi)<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|     ///////// Lattice Init //////////// |  | ||||||
|     GridCartesian         * FGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); |  | ||||||
|     GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid); |  | ||||||
|      |  | ||||||
|     ///////// RNG Init //////////// |  | ||||||
|     std::vector<int> seeds4({1,2,3,4}); |  | ||||||
|     GridParallelRNG          RNG4(FGrid);  RNG4.SeedFixedIntegers(seeds4); |  | ||||||
|     std::cout << GridLogMessage << "Initialised RNGs" << std::endl; |  | ||||||
|  |  | ||||||
|     RealD mass=0.1; |  | ||||||
|     RealD csw=1.0; |  | ||||||
|  |  | ||||||
|     typedef WilsonCloverFermionF Action; |  | ||||||
|     typedef typename Action::FermionField Fermion;  |  | ||||||
|     typedef LatticeGaugeFieldF Gauge; |  | ||||||
|      |  | ||||||
|     Gauge Umu(FGrid);  SU<Nc>::HotConfiguration(RNG4,Umu);  |  | ||||||
|  |  | ||||||
|     // The same csw value is passed for both clover coefficient arguments |  | ||||||
|     Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw); |  | ||||||
|  |  | ||||||
|     ///////// Source preparation //////////// |  | ||||||
|     Fermion src   (FGrid); random(RNG4,src); |  | ||||||
|     Fermion r     (FGrid); |  | ||||||
|    |  | ||||||
|     { |  | ||||||
|  |  | ||||||
|       const int num_cases = 1; |  | ||||||
|       // NOTE(review): fmt names four columns although only one case is run below |  | ||||||
|       std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S "); |  | ||||||
|        |  | ||||||
|       controls Cases [] = { |  | ||||||
| 	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }, |  | ||||||
|       };  |  | ||||||
|  |  | ||||||
|       for(int c=0;c<num_cases;c++) { |  | ||||||
| 	 |  | ||||||
| 	WilsonKernelsStatic::Comms = Cases[c].CommsOverlap; |  | ||||||
| 	WilsonKernelsStatic::Opt   = Cases[c].Opt; |  | ||||||
| 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch); |  | ||||||
|        |  | ||||||
| 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
| 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl; |  | ||||||
| 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
| 	 |  | ||||||
| 	// Untimed warm-up calls before the measured loop |  | ||||||
| 	int nwarm = 10; |  | ||||||
| 	double t0=usecond(); |  | ||||||
| 	FGrid->Barrier(); |  | ||||||
| 	for(int i=0;i<nwarm;i++){ |  | ||||||
| 	  Dc.M(src,r); |  | ||||||
| 	} |  | ||||||
| 	FGrid->Barrier(); |  | ||||||
| 	double t1=usecond(); |  | ||||||
| 	uint64_t ncall = 500; |  | ||||||
|  |  | ||||||
| 	// Every rank adopts the call count held by rank 0 |  | ||||||
| 	FGrid->Broadcast(0,&ncall,sizeof(ncall)); |  | ||||||
|  |  | ||||||
| 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl; |  | ||||||
|  |  | ||||||
| 	// Time each call individually so min/max/mean can be derived |  | ||||||
| 	time_statistics timestat; |  | ||||||
| 	std::vector<double> t_time(ncall); |  | ||||||
| 	for(uint64_t i=0;i<ncall;i++){ |  | ||||||
| 	  t0=usecond(); |  | ||||||
| 	  Dc.M(src,r); |  | ||||||
| 	  t1=usecond(); |  | ||||||
| 	  t_time[i] = t1-t0; |  | ||||||
| 	} |  | ||||||
| 	FGrid->Barrier(); |  | ||||||
| 	 |  | ||||||
| 	double volume=1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; |  | ||||||
| 	// Assumed flop count per site for one application of M (TODO confirm breakdown) |  | ||||||
| 	double flops=(1344+ 24+6*6*8*2)*volume; |  | ||||||
| 	double mf_hi, mf_lo, mf_err; |  | ||||||
| 	 |  | ||||||
| 	timestat.statistics(t_time); |  | ||||||
| 	mf_hi = flops/timestat.min; |  | ||||||
| 	mf_lo = flops/timestat.max; |  | ||||||
| 	mf_err= flops/timestat.min * timestat.err/timestat.mean; |  | ||||||
|  |  | ||||||
| 	mflops = flops/timestat.mean; |  | ||||||
| 	mflops_all.push_back(mflops); |  | ||||||
| 	if ( mflops_best == 0   ) mflops_best = mflops; |  | ||||||
| 	if ( mflops_worst== 0   ) mflops_worst= mflops; |  | ||||||
| 	if ( mflops>mflops_best ) mflops_best = mflops; |  | ||||||
| 	if ( mflops<mflops_worst) mflops_worst= mflops; |  | ||||||
| 	 |  | ||||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl; |  | ||||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank   "<< mflops/NP<<std::endl; |  | ||||||
| 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node   "<< mflops/NN<<std::endl; |  | ||||||
|        |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|       // Summary lines use the same "Dclov" label as the per-case lines above |  | ||||||
|       std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|       std::cout<<GridLogMessage << L<<"^4  Dclov Best  mflop/s      =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl; |  | ||||||
|       std::cout<<GridLogMessage << L<<"^4  Dclov Worst mflop/s      =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl; |  | ||||||
|       std::cout<<GridLogMessage <<fmt << std::endl; |  | ||||||
|       std::cout<<GridLogMessage ; |  | ||||||
|  |  | ||||||
|       for(double mf : mflops_all){ |  | ||||||
| 	std::cout<<mf/NN<<" ; " ; |  | ||||||
|       } |  | ||||||
|       std::cout<<std::endl; |  | ||||||
|     } |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     return mflops_best; |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////// |  | ||||||
| // Benchmark driver. |  | ||||||
| // |  | ||||||
| // Runs Benchmark::DWF with Ls=1 (reported in the "Clover" column, see the banner text), |  | ||||||
| // Benchmark::DWF with Ls=12 and Benchmark::Staggered for every L in L_list, then the optional |  | ||||||
| // memory, batched BLAS, SU(4) and comms benchmarks selected by the do_* flags. |  | ||||||
| // A per node summary is logged and also written through FP: world rank 0 writes |  | ||||||
| // Benchmark_usqcd.csv, every other rank writes to /dev/null. |  | ||||||
| // NOTE(review): Benchmark::Clover is not called from here. |  | ||||||
| /////////////////////////////////////////////////////// |  | ||||||
| int main (int argc, char ** argv) |  | ||||||
| { |  | ||||||
|   Grid_init(&argc,&argv); |  | ||||||
|  |  | ||||||
|   if (GlobalSharedMemory::WorldRank==0) {  |  | ||||||
|     FP = fopen("Benchmark_usqcd.csv","w"); |  | ||||||
|   } else { |  | ||||||
|     FP = fopen("/dev/null","w"); |  | ||||||
|   } |  | ||||||
|   // Fail fast: FP is first written to only after every benchmark has already run |  | ||||||
|   assert(FP != nullptr); |  | ||||||
|  |  | ||||||
|   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential); |  | ||||||
|   LebesgueOrder::Block = std::vector<int>({2,2,2,2}); |  | ||||||
|  |  | ||||||
|   Benchmark::Decomposition(); |  | ||||||
|  |  | ||||||
|   // Switches for the optional benchmark sections below |  | ||||||
|   int do_su4=0; |  | ||||||
|   int do_memory=1; |  | ||||||
|   int do_comms =1; |  | ||||||
|   int do_blas  =1; |  | ||||||
|  |  | ||||||
|   // sel and selm1 index L_list (and dwf4) for the final comparison point: L=32 and L=24 |  | ||||||
|   int sel=4; |  | ||||||
|   std::vector<int> L_list({8,12,16,24,32}); |  | ||||||
|   int selm1=sel-1; |  | ||||||
|  |  | ||||||
|   std::vector<double> clover; |  | ||||||
|   std::vector<double> dwf4; |  | ||||||
|   std::vector<double> staggered; |  | ||||||
|  |  | ||||||
|   int Ls=1; |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   for(int l=0;l<L_list.size();l++){ |  | ||||||
|     // The "clover" results are produced by the DWF benchmark with Ls=1 |  | ||||||
|     clover.push_back(Benchmark::DWF(1,L_list[l])); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   Ls=12; |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   for(int l=0;l<L_list.size();l++){ |  | ||||||
|     double result = Benchmark::DWF(Ls,L_list[l]) ; |  | ||||||
|     dwf4.push_back(result); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   for(int l=0;l<L_list.size();l++){ |  | ||||||
|     double result = Benchmark::Staggered(L_list[l]) ; |  | ||||||
|     staggered.push_back(result); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|   std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl; |  | ||||||
|   for(int l=0;l<L_list.size();l++){ |  | ||||||
|     std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl; |  | ||||||
|   } |  | ||||||
|   std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|   // Node count as last stored in NN_global; used for the per node figures below |  | ||||||
|   int NN=NN_global; |  | ||||||
|   if ( do_memory ) { |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Memory benchmark " <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     Benchmark::Memory(); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if ( do_blas ) { |  | ||||||
| #if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)    |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     Benchmark::BLAS(); |  | ||||||
| #endif |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   if ( do_su4 ) { |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     Benchmark::SU4(); |  | ||||||
|   } |  | ||||||
|    |  | ||||||
|   if ( do_comms ) { |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Communications benchmark " <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     Benchmark::Comms(); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl; |  | ||||||
|     fprintf(FP,"Per node summary table\n"); |  | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|     fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n"); |  | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|     for(int l=0;l<L_list.size();l++){ |  | ||||||
|       // The log line prints the stored values per node; the CSV line additionally divides by 1000 |  | ||||||
|       std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl; |  | ||||||
|       fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.); |  | ||||||
|     } |  | ||||||
|     fprintf(FP,"\n"); |  | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|     // Comparison point: mean of the DWF results at L_list[sel] and L_list[selm1], per node |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl; |  | ||||||
|     std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl; |  | ||||||
|     std::cout<<std::setprecision(3); |  | ||||||
|     std::cout<<GridLogMessage << "=================================================================================="<<std::endl; |  | ||||||
|  |  | ||||||
|   Grid_finalize(); |  | ||||||
|   fclose(FP); |  | ||||||
| } |  | ||||||
							
								
								
									
										15
									
								
								bootstrap.sh
									
									
									
									
									
								
							
							
						
						
									
										15
									
								
								bootstrap.sh
									
									
									
									
									
								
							| @@ -1,12 +1,12 @@ | |||||||
| #!/usr/bin/env bash | #!/usr/bin/env bash | ||||||
| set -e | set -e | ||||||
|  |  | ||||||
| EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2' | EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2' | ||||||
| EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626' | EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11' | ||||||
|  |  | ||||||
|  |  | ||||||
| echo "-- deploying Eigen source..." | echo "-- deploying Eigen source..." | ||||||
| ARC=$(basename ${EIGEN_URL}) | ARC=`basename ${EIGEN_URL}` | ||||||
| wget ${EIGEN_URL} --no-check-certificate | wget ${EIGEN_URL} --no-check-certificate | ||||||
| if command -v sha256sum; then | if command -v sha256sum; then | ||||||
|    echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \ |    echo "$EIGEN_SHA256SUM  $(basename "$EIGEN_URL")" \ | ||||||
| @@ -14,8 +14,13 @@ if command -v sha256sum; then | |||||||
| else | else | ||||||
|    echo "WARNING: could not verify checksum, please install sha256sum" >&2 |    echo "WARNING: could not verify checksum, please install sha256sum" >&2 | ||||||
| fi | fi | ||||||
| ./scripts/update_eigen.sh "${ARC}" | ./scripts/update_eigen.sh ${ARC} | ||||||
| rm "${ARC}" | rm ${ARC} | ||||||
|  | # patch for non-portable includes in Eigen 3.3.5 | ||||||
|  | # apparently already fixed in Eigen HEAD so it should not be  | ||||||
|  | # a problem in the future (A.P.) | ||||||
|  | patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch | ||||||
|  |  | ||||||
| echo '-- generating Make.inc files...' | echo '-- generating Make.inc files...' | ||||||
| ./scripts/filelist | ./scripts/filelist | ||||||
| echo '-- generating configure script...' | echo '-- generating configure script...' | ||||||
|   | |||||||
							
								
								
									
										21
									
								
								configure.ac
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								configure.ac
									
									
									
									
									
								
							| @@ -226,14 +226,23 @@ case ${ac_SFW_FP16} in | |||||||
| esac | esac | ||||||
|  |  | ||||||
| ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons | ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons | ||||||
| AC_ARG_ENABLE([accelerator-aware-mpi], | AC_ARG_ENABLE([accelerator-cshift], | ||||||
|     [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])], |     [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])], | ||||||
|     [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes]) |     [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes]) | ||||||
|  |  | ||||||
| case ${ac_ACCELERATOR_AWARE_MPI} in | AC_ARG_ENABLE([ucx-buggy], | ||||||
|  |     [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])], | ||||||
|  |     [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no]) | ||||||
|  |  | ||||||
|  | case ${ac_UCXBUGGY} in | ||||||
|     yes) |     yes) | ||||||
|       AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host]) |     ac_ACC_CSHIFT=no;; | ||||||
|       AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);; |     *);; | ||||||
|  | esac | ||||||
|  |  | ||||||
|  | case ${ac_ACC_CSHIFT} in | ||||||
|  |     yes) | ||||||
|  |       AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);; | ||||||
|     *);; |     *);; | ||||||
| esac | esac | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,183 +0,0 @@ | |||||||
| /*  |  | ||||||
|  * Example_plaquette.cc                                                                |  | ||||||
|  *  |  | ||||||
|  * D. Clarke  |  | ||||||
|  *  |  | ||||||
|  * Here I just want to create an incredibly simple main to get started with GRID and get used |  | ||||||
|  * to its syntax. If the reader is like me, they vaguely understand something about lattice coding, |  | ||||||
|  * they don't know a ton of C++, don't know much of the fine details, and certainly know nothing about GRID. |  | ||||||
|  * |  | ||||||
|  * Once you've made a new executable, like this one, you can bootstrap.sh again. At this point, |  | ||||||
|  * the code should be able to find your new executable. You can tell that bootstrap.sh worked by |  | ||||||
|  * having a look at Make.inc. You should see your executable inside there. |  | ||||||
|  * |  | ||||||
|  * Warning: This code is illustrative only, not well tested, and not meant for production use. The best |  | ||||||
|  * way to read this code is to start at the main. |  | ||||||
|  *  |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| // All your mains should have this |  | ||||||
| #include <Grid/Grid.h> |  | ||||||
| using namespace Grid; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| // This copies what already exists in WilsonLoops.h. The point here is to be pedagogical and explain in |  | ||||||
| // detail what everything does so we can see how GRID works. |  | ||||||
| template <class Gimpl> class WLoops : public Gimpl { |  | ||||||
| public: |  | ||||||
|     // Gimpl seems to be an arbitrary class. Within this class, it is expected that certain types are |  | ||||||
|     // already defined, things like Scalar and Field. This macro includes a bunch of #typedefs that |  | ||||||
|     // implement this equivalence at compile time. |  | ||||||
|     INHERIT_GIMPL_TYPES(Gimpl); |  | ||||||
|  |  | ||||||
|     // Some example Gimpls can be found in GaugeImplementations.h, at the bottom. These are in turn built |  | ||||||
|     // out of GaugeImplTypes, which can be found in GaugeImplTypes.h. The GaugeImplTypes contain the base |  | ||||||
|     // field/vector/link/whatever types. These inherit from iScalar, iVector, and iMatrix objects, which |  | ||||||
|     // are sort of the building blocks for general math objects. The "i" at the beginning of these names |  | ||||||
|     // indicates that they should be for internal use only. It seems like these base types have the |  | ||||||
|     // acceleration, e.g. SIMD or GPU or what-have-you, abstracted away. How you accelerate these things |  | ||||||
|     // appears to be controlled through a template parameter called vtype. |  | ||||||
|  |  | ||||||
|     // The general math/physics objects, such as a color matrix, are built up by nesting these objects. |  | ||||||
|     // For instance a general color matrix has two color indices, so it's built up like |  | ||||||
|     //     iScalar<iScalar<iMatrix<vtype ... |  | ||||||
|     // where the levels going from the inside out are color, spin, then Lorentz indices. Scalars have |  | ||||||
|     // no indices, so it's what we use when such an index isn't needed. Lattice objects are made by one |  | ||||||
|     // higher level of indexing using iVector. |  | ||||||
|  |  | ||||||
|     // These types will be used for U and U_mu objects, respectively. |  | ||||||
|     typedef typename Gimpl::GaugeLinkField GaugeMat; |  | ||||||
|     typedef typename Gimpl::GaugeField GaugeLorentz; |  | ||||||
|  |  | ||||||
|     // U_mu_nu(x) |  | ||||||
|     // Writes the plaquette field in the (mu,nu) plane into plaq. U holds one link field per direction. |  | ||||||
|     static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) { |  | ||||||
|         // Calls like CovShiftForward and CovShiftBackward have 3 arguments, and they multiply together |  | ||||||
|         // the first and last argument. (Second arg gives the shift direction.) The CovShiftIdentityBackward |  | ||||||
|         // has meanwhile only two arguments; it just returns the shifted (adjoint since backward) link.  |  | ||||||
|         plaq = Gimpl::CovShiftForward(U[mu],mu, |  | ||||||
|                    // Means Link*Cshift(field,mu,1), arguments are Link, mu, field in that order. |  | ||||||
|                    Gimpl::CovShiftForward(U[nu],nu, |  | ||||||
|                        Gimpl::CovShiftBackward(U[mu],mu, |  | ||||||
|                            // This means Cshift(adj(Link), mu, -1) |  | ||||||
|                            Gimpl::CovShiftIdentityBackward(U[nu], nu)))); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // tr U_mu_nu(x) |  | ||||||
|     // Writes the trace of the (mu,nu) plaquette field into plaq. |  | ||||||
|     static void traceDirPlaquette(ComplexField &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) { |  | ||||||
|         // This .Grid() syntax seems to get the pointer to the GridBase. Apparently this is needed as argument |  | ||||||
|         // to instantiate a Lattice object. |  | ||||||
|         GaugeMat sp(U[0].Grid()); |  | ||||||
|         dirPlaquette(sp, U, mu, nu); |  | ||||||
|         plaq = trace(sp); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // sum_mu_nu tr U_mu_nu(x) |  | ||||||
|     // Accumulates the traced plaquette over all planes into Plaq (any previous content is overwritten). |  | ||||||
|     static void sitePlaquette(ComplexField &Plaq, const std::vector<GaugeMat> &U) { |  | ||||||
|         ComplexField sitePlaq(U[0].Grid()); |  | ||||||
|         Plaq = Zero(); |  | ||||||
|         // Nd=4 and Nc=3 are set as global constants in QCD.h |  | ||||||
|         // The loops run over nu < mu only, so each plane is visited exactly once. |  | ||||||
|         for (int mu = 1; mu < Nd; mu++) { |  | ||||||
|             for (int nu = 0; nu < mu; nu++) { |  | ||||||
|                 traceDirPlaquette(sitePlaq, U, mu, nu); |  | ||||||
|                 Plaq = Plaq + sitePlaq; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // sum_mu_nu_x Re tr U_mu_nu(x) |  | ||||||
|     // Returns the real part of the site-summed, plane-summed traced plaquette of Umu. |  | ||||||
|     static RealD sumPlaquette(const GaugeLorentz &Umu) { |  | ||||||
|         std::vector<GaugeMat> U(Nd, Umu.Grid()); |  | ||||||
|         for (int mu = 0; mu < Nd; mu++) { |  | ||||||
|             // Umu is a GaugeLorentz object, and as such has a non-trivial Lorentz index. We can |  | ||||||
|             // access the element in the mu Lorentz index with this PeekIndex syntax. |  | ||||||
|             U[mu] = PeekIndex<LorentzIndex>(Umu, mu); |  | ||||||
|         } |  | ||||||
|         ComplexField Plaq(Umu.Grid()); |  | ||||||
|         sitePlaquette(Plaq, U); |  | ||||||
|         // I guess this should be the line that sums over all space-time sites. |  | ||||||
|         auto Tp = sum(Plaq); |  | ||||||
|         // Until now, we have been working with objects inside the tensor nest. This TensorRemove gets |  | ||||||
|         // rid of the tensor nest to return whatever is inside. |  | ||||||
|         auto p  = TensorRemove(Tp); |  | ||||||
|         return p.real(); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // < Re tr U_mu_nu(x) > |  | ||||||
|     // Returns sumPlaquette(Umu) divided by the global site count, the number of planes and Nc. |  | ||||||
|     static RealD avgPlaquette(const GaugeLorentz &Umu) { |  | ||||||
|         // Real double type |  | ||||||
|         RealD sumplaq = sumPlaquette(Umu); |  | ||||||
|         // gSites() is the number of global sites. there is also lSites() for local sites. |  | ||||||
|         double vol = Umu.Grid()->gSites(); |  | ||||||
|         // The number of orientations. 4*3/2=6 for Nd=4, as known. |  | ||||||
|         double faces = (1.0 * Nd * (Nd - 1)) / 2.0; |  | ||||||
|         return sumplaq / vol / faces / Nc; |  | ||||||
|     } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| // Next we show an example of how to construct an input parameter class. We first inherit |  | ||||||
| // from Serializable. Then all class data members have to be defined using the |  | ||||||
| // GRID_SERIALIZABLE_CLASS_MEMBERS macro. This variadic macro allows for arbitrarily many |  | ||||||
| // class data members. In the below case, we make a parameter file holding the configuration |  | ||||||
| // name. Here, it expects the name to be labeled with "conf_name" in the configuration file.  |  | ||||||
| struct ConfParameters: Serializable { |  | ||||||
|     GRID_SERIALIZABLE_CLASS_MEMBERS( |  | ||||||
|         ConfParameters, |  | ||||||
|         std::string, conf_name); |  | ||||||
|  |  | ||||||
|     template <class ReaderClass> |  | ||||||
|     ConfParameters(Reader<ReaderClass>& Reader){ |  | ||||||
|         // If we are reading an XML file, it should be structured like: |  | ||||||
|         // <grid> |  | ||||||
|         //   <parameters> |  | ||||||
|         //     <conf_name>l20t20b06498a_nersc.302500</conf_name> |  | ||||||
|         //   </parameters> |  | ||||||
|         // </grid> |  | ||||||
|         read(Reader, "parameters", *this); |  | ||||||
|     } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| // Entry point: reads the configuration named in Example_plaquette.xml and prints its average plaquette. |  | ||||||
| // argc is the argument count; argv (char **) points to the first of an array of C-string pointers. |  | ||||||
| int main (int argc, char **argv) |  | ||||||
| { |  | ||||||
|     // This initializes Grid. Some command line options include |  | ||||||
|     //   --mpi n.n.n.n |  | ||||||
|     //   --threads n |  | ||||||
|     //   --grid n.n.n.n |  | ||||||
|     Grid_init(&argc, &argv); |  | ||||||
|  |  | ||||||
|     // This is where you would specify a custom lattice size, if not from the command line. Here |  | ||||||
|     // Nd is a global quantity that is currently set to 4. |  | ||||||
|     Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); |  | ||||||
|     Coordinate mpi_layout  = GridDefaultMpi(); |  | ||||||
|     Coordinate latt_size   = GridDefaultLatt(); |  | ||||||
|  |  | ||||||
|     // Instantiate the spacetime Grid on which everything will be built. |  | ||||||
|     GridCartesian GRID(latt_size,simd_layout,mpi_layout); |  | ||||||
|  |  | ||||||
|     // The PeriodicGimplD type is what you want for gauge matrices. There is also a LatticeGaugeFieldD |  | ||||||
|     // type that you can use, which will work perfectly with what follows.  |  | ||||||
|     PeriodicGimplD::Field U(&GRID); |  | ||||||
|  |  | ||||||
|     // Here we read in the parameter file Example_plaquette.xml to get conf_name. The last argument is what the |  | ||||||
|     // top organizational level is called in the param file.  |  | ||||||
|     XmlReader Reader("Example_plaquette.xml",false, "grid"); |  | ||||||
|     ConfParameters param(Reader);   |  | ||||||
|  |  | ||||||
|     // Load a lattice from SIMULATeQCD into U. SIMULATeQCD finds plaquette = 0.6381995717 |  | ||||||
|     FieldMetaData header; |  | ||||||
|     NerscIO::readConfiguration(U, header, param.conf_name); |  | ||||||
|  |  | ||||||
|     // Let's see what we find. |  | ||||||
|     RealD plaq = WLoops<PeriodicGimplD>::avgPlaquette(U); |  | ||||||
|  |  | ||||||
|     // This is how you make log messages. |  | ||||||
|     std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1) << "Plaquette = " << plaq << std::endl; |  | ||||||
|  |  | ||||||
|     // To wrap things up. |  | ||||||
|     Grid_finalize(); |  | ||||||
| } |  | ||||||
							
								
								
									
										19
									
								
								scripts/eigen-3.3.5.Tensor.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								scripts/eigen-3.3.5.Tensor.patch
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | |||||||
|  | --- ./Eigen/unsupported/Eigen/CXX11/Tensor	2018-07-23 10:33:42.000000000 +0100 | ||||||
|  | +++ Tensor	2018-08-28 16:15:56.000000000 +0100 | ||||||
|  | @@ -25,7 +25,7 @@ | ||||||
|  |  #include <utility> | ||||||
|  |  #endif | ||||||
|  |   | ||||||
|  | -#include <Eigen/src/Core/util/DisableStupidWarnings.h> | ||||||
|  | +#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" | ||||||
|  |   | ||||||
|  |  #include "../SpecialFunctions" | ||||||
|  |  #include "src/util/CXX11Meta.h" | ||||||
|  | @@ -147,6 +147,6 @@ | ||||||
|  |   | ||||||
|  |  #include "src/Tensor/TensorIO.h" | ||||||
|  |   | ||||||
|  | -#include <Eigen/src/Core/util/ReenableStupidWarnings.h> | ||||||
|  | +#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" | ||||||
|  |   | ||||||
|  |  //#endif // EIGEN_CXX11_TENSOR_MODULE | ||||||
| @@ -1,67 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| #PBS -q debug |  | ||||||
| #PBS -l select=1 |  | ||||||
| #PBS -l walltime=00:20:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA |  | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread |  | ||||||
| #unset OMP_PLACES |  | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR |  | ||||||
|  |  | ||||||
| source ../sourceme.sh |  | ||||||
| module load pti-gpu |  | ||||||
|  |  | ||||||
| #cat $PBS_NODEFILE |  | ||||||
|  |  | ||||||
| export OMP_NUM_THREADS=4 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 |  | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST |  | ||||||
|  |  | ||||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 |  | ||||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 |  | ||||||
| #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 |  | ||||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 |  | ||||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 |  | ||||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 |  | ||||||
| #export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU |  | ||||||
|  |  | ||||||
| # 12 ppn, 1 node, 12 ranks |  | ||||||
| # |  | ||||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \ |  | ||||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"  |  | ||||||
| #$CMD | tee 1node.comms |  | ||||||
|  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 1 -ppn 1  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \ |  | ||||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 " |  | ||||||
| #$CMD | tee 1tile.dwf |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \ |  | ||||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 1node.32.32.32.48.dwf |  | ||||||
|  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \ |  | ||||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| #$CMD | tee 1node.64.64.32.96.dwf |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \ |  | ||||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| #$CMD | tee 1node.64.32.32.48.dwf |  | ||||||
|  |  | ||||||
| @@ -1,60 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 |  | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess |  | ||||||
| #PBS -l select=1024 |  | ||||||
| #PBS -l walltime=01:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA |  | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread |  | ||||||
| #unset OMP_PLACES |  | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR |  | ||||||
|  |  | ||||||
| source ../sourceme.sh |  | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE |  | ||||||
|  |  | ||||||
| export OMP_NUM_THREADS=3 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 |  | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST |  | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 |  | ||||||
| #export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU |  | ||||||
| export FI_CXI_CQ_FILL_PERCENT=10 |  | ||||||
| export FI_CXI_DEFAULT_CQ_SIZE=262144 |  | ||||||
| #export FI_CXI_DEFAULT_CQ_SIZE=131072 |  | ||||||
| #export FI_CXI_CQ_FILL_PERCENT=20 |  | ||||||
|  |  | ||||||
| # 12 ppn, 1024 nodes, 12288 ranks |  | ||||||
| # |  | ||||||
| CMD="mpiexec -np 12288 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_comms_host_device --mpi 8.6.16.16 --grid 64.48.64.284 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" |  | ||||||
|  |  | ||||||
| $CMD  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 12288 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 1024node.dwf.small.cq |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 12288 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 1024node.dwf.cq |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1,55 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| #PBS -q workq |  | ||||||
| #PBS -l select=2 |  | ||||||
| #PBS -l walltime=00:20:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA |  | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread |  | ||||||
| #unset OMP_PLACES |  | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR |  | ||||||
|  |  | ||||||
| source ../sourceme.sh |  | ||||||
| module load pti-gpu |  | ||||||
|  |  | ||||||
| #cat $PBS_NODEFILE |  | ||||||
|  |  | ||||||
| export OMP_NUM_THREADS=4 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 |  | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST |  | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU |  | ||||||
|  |  | ||||||
| # 12 ppn, 2 nodes, 24 ranks |  | ||||||
| # |  | ||||||
| CMD="mpiexec -np 24 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \ |  | ||||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"  |  | ||||||
| $CMD | tee 2node.comms |  | ||||||
|  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 24 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \ |  | ||||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 2node.32.32.64.48.dwf |  | ||||||
|  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 24 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \ |  | ||||||
| 		--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 2node.64.64.64.96.dwf |  | ||||||
|  |  | ||||||
| @@ -1,56 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 |  | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess |  | ||||||
| #PBS -l select=2048 |  | ||||||
| #PBS -l walltime=01:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA |  | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread |  | ||||||
| #unset OMP_PLACES |  | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR |  | ||||||
|  |  | ||||||
| source ../sourceme.sh |  | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE |  | ||||||
|  |  | ||||||
| export OMP_NUM_THREADS=3 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 |  | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST |  | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU |  | ||||||
|  |  | ||||||
| # 12 ppn, 2048 nodes, 24576 ranks |  | ||||||
| # |  | ||||||
| CMD="mpiexec -np 24576 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_comms_host_device --mpi 8.12.16.16 --grid 64.48.64.284 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" |  | ||||||
|  |  | ||||||
| $CMD  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 24576 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 128.128.128.384 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 2048node.dwf.small |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 24576 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.24 --grid 256.256.256.768 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 2048node.dwf |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -1,48 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 |  | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess |  | ||||||
| #PBS -l select=256 |  | ||||||
| #PBS -l walltime=01:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA |  | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread |  | ||||||
| #unset OMP_PLACES |  | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR |  | ||||||
|  |  | ||||||
| source ../sourceme.sh |  | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE |  | ||||||
|  |  | ||||||
| export OMP_NUM_THREADS=3 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 |  | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST |  | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU |  | ||||||
|  |  | ||||||
| # 12 ppn, 256 nodes, 3072 ranks |  | ||||||
| # |  | ||||||
| CMD="mpiexec -np 3072 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_comms_host_device --mpi 8.6.8.8 --grid 32.24.32.192 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" |  | ||||||
|  |  | ||||||
| $CMD  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 3072 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 8.8.4.12 --grid 128.128.128.768 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 256node.dwf.large |  | ||||||
| @@ -1,48 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 |  | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess |  | ||||||
| #PBS -l select=512 |  | ||||||
| #PBS -l walltime=01:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA |  | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread |  | ||||||
| #unset OMP_PLACES |  | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR |  | ||||||
|  |  | ||||||
| source ../sourceme.sh |  | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE |  | ||||||
|  |  | ||||||
| export OMP_NUM_THREADS=3 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 |  | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST |  | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU |  | ||||||
|  |  | ||||||
| # 12 ppn, 512 nodes, 6144 ranks |  | ||||||
| # |  | ||||||
| CMD="mpiexec -np 6144 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_comms_host_device --mpi 8.6.8.16 --grid 32.24.32.192 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" |  | ||||||
|  |  | ||||||
| $CMD  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 6144 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.12 --grid 256.128.128.768 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 512node.dwf.large |  | ||||||
| @@ -1,80 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 |  | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess |  | ||||||
| #PBS -l select=32 |  | ||||||
| #PBS -l walltime=01:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA |  | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread |  | ||||||
| #unset OMP_PLACES |  | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR |  | ||||||
|  |  | ||||||
| source ../sourceme.sh |  | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE |  | ||||||
|  |  | ||||||
| export OMP_NUM_THREADS=3 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 |  | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST |  | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU |  | ||||||
|  |  | ||||||
| # 12 ppn, 32 nodes, 384 ranks |  | ||||||
| # |  | ||||||
| CMD="mpiexec -np 384 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_comms_host_device --mpi 4.6.4.4 --grid 32.24.32.192 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32" |  | ||||||
|  |  | ||||||
| $CMD  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 12 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 1.2.2.3 --grid 16.64.64.96 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 1node.dwf |  | ||||||
|  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 24 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.64.64.96 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 2node.dwf |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 48 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.2.6 --grid 32.64.64.192 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 4node.dwf |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 96 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.2.4.6 --grid 32.64.128.192 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 8node.dwf |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 192 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 2.4.4.6 --grid 32.128.128.192 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 16node.dwf |  | ||||||
|  |  | ||||||
|  |  | ||||||
| CMD="mpiexec -np 384 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Benchmark_dwf_fp32 --mpi 4.4.4.6 --grid 64.128.128.192 \ |  | ||||||
| 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap" |  | ||||||
| $CMD | tee 32node.dwf |  | ||||||
| @@ -1,34 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| #export NUMA_MAP=(2 2 2 3 3 3 2 2 2 3 3 3 ) |  | ||||||
| #export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1); |  | ||||||
| #export  GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1) |  | ||||||
|  |  | ||||||
| export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 ); |  | ||||||
| export  GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 ) |  | ||||||
|  |  | ||||||
| export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
| export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
|    |  | ||||||
| unset EnableWalkerPartition |  | ||||||
| export EnableImplicitScaling=0 |  | ||||||
| export ZE_AFFINITY_MASK=$gpu_id |  | ||||||
| export ONEAPI_DEVICE_FILTER=gpu,level_zero |  | ||||||
|  |  | ||||||
| export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 |  | ||||||
| export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 |  | ||||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5 |  | ||||||
| #export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 |  | ||||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 |  | ||||||
| #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 |  | ||||||
|  |  | ||||||
| echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA " |  | ||||||
|  |  | ||||||
| if [ $PALS_RANKID = "0" ] |  | ||||||
| then |  | ||||||
| #    numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline  "$@" |  | ||||||
| #    numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@" |  | ||||||
|     numactl -m $NUMA -N $NUMA  "$@" |  | ||||||
| else  |  | ||||||
|     numactl -m $NUMA -N $NUMA  "$@" |  | ||||||
| fi |  | ||||||
| @@ -1,29 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # Per-local-rank lookup tables, indexed by $PALS_LOCAL_RANKID (eight entries each). |  | ||||||
| export  NUMA_MAP=(2 2 3 3  2 2  3 3  ) |  | ||||||
| export  PROC_MAP=(0 0 1 1  0 0  1 1  ) |  | ||||||
| export  NIC_MAP=(0 0  4 4  1 1  5 5  ) |  | ||||||
| export  GPU_MAP=(0 1  3 4  0 1  3 4  ) |  | ||||||
| export TILE_MAP=(0 0  0 0  1 1  1 1  ) |  | ||||||
| export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
| # Select this rank's entry; a bare $PROC_MAP would always expand to element 0. |  | ||||||
| export PROC=${PROC_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
| export NIC=${NIC_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
| export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
| export tile_id=${TILE_MAP[$PALS_LOCAL_RANKID]} |  | ||||||
|    |  | ||||||
| #export GRID_MPICH_NIC_BIND=$NIC |  | ||||||
|  |  | ||||||
| unset EnableWalkerPartition |  | ||||||
| export EnableImplicitScaling=0 |  | ||||||
| export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 |  | ||||||
| export ZE_AFFINITY_MASK=$gpu_id.$tile_id |  | ||||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:$gpu_id.$tile_id |  | ||||||
| export ONEAPI_DEVICE_FILTER=gpu,level_zero |  | ||||||
| export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0 |  | ||||||
| export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 |  | ||||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2 |  | ||||||
| export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1 |  | ||||||
| #export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1 |  | ||||||
|  |  | ||||||
| echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NIC $GRID_MPICH_NIC_BIND ; NUMA domain $NUMA" |  | ||||||
|  |  | ||||||
| # Run the wrapped command with memory bound to $NUMA and CPUs bound to this rank's PROC_MAP entry. |  | ||||||
| numactl -m $NUMA -N $PROC  "$@" |  | ||||||
| @@ -1,16 +0,0 @@ | |||||||
|  |  | ||||||
| ../../configure \ |  | ||||||
| 	--enable-simd=GPU \ |  | ||||||
| 	--enable-gen-simd-width=64 \ |  | ||||||
| 	--enable-comms=mpi-auto \ |  | ||||||
| 	--disable-gparity \ |  | ||||||
| 	--disable-fermion-reps \ |  | ||||||
| 	--enable-shm=nvlink \ |  | ||||||
| 	--enable-accelerator=sycl \ |  | ||||||
| 	--enable-accelerator-aware-mpi=yes\ |  | ||||||
| 	--enable-unified=no \ |  | ||||||
| 	MPICXX=mpicxx \ |  | ||||||
| 	CXX=icpx \ |  | ||||||
| 	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \ |  | ||||||
| 	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel" |  | ||||||
|  |  | ||||||
| @@ -1,9 +0,0 @@ | |||||||
| export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 |  | ||||||
| export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 |  | ||||||
| export http_proxy=http://proxy.alcf.anl.gov:3128 |  | ||||||
| export https_proxy=http://proxy.alcf.anl.gov:3128 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 |  | ||||||
| git config --global http.proxy http://proxy.alcf.anl.gov:3128 |  | ||||||
| module use /soft/modulefiles |  | ||||||
| module load intel_compute_runtime/release/agama-devel-682.22 |  | ||||||
|   |  | ||||||
| @@ -1,2 +0,0 @@ | |||||||
| module load oneapi/eng-compiler/2023.05.15.003 |  | ||||||
| module load mpich/51.2/icc-all-deterministic-pmix-gpu |  | ||||||
| @@ -1,28 +0,0 @@ | |||||||
| #export ONEAPI_DEVICE_SELECTOR=level_zero:0.0 |  | ||||||
|  |  | ||||||
| module load oneapi/release/2023.12.15.001 |  | ||||||
|  |  | ||||||
| #module use /soft/modulefiles |  | ||||||
| #module load intel_compute_runtime/release/agama-devel-682.22 |  | ||||||
|  |  | ||||||
| export FI_CXI_DEFAULT_CQ_SIZE=131072 |  | ||||||
| export FI_CXI_CQ_FILL_PERCENT=20 |  | ||||||
|  |  | ||||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" |  | ||||||
| #export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode" |  | ||||||
|  |  | ||||||
| # |  | ||||||
| # -ftarget-register-alloc-mode=pvc:default  |  | ||||||
| # -ftarget-register-alloc-mode=pvc:small |  | ||||||
| # -ftarget-register-alloc-mode=pvc:large |  | ||||||
| # -ftarget-register-alloc-mode=pvc:auto |  | ||||||
| # |  | ||||||
|  |  | ||||||
| export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 |  | ||||||
| export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 |  | ||||||
| export http_proxy=http://proxy.alcf.anl.gov:3128 |  | ||||||
| export https_proxy=http://proxy.alcf.anl.gov:3128 |  | ||||||
| #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 |  | ||||||
| git config --global http.proxy http://proxy.alcf.anl.gov:3128 |  | ||||||
|  |  | ||||||
| export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" |  | ||||||
| @@ -1,41 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| ## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00 |  | ||||||
|  |  | ||||||
| #PBS -q EarlyAppAccess |  | ||||||
| #PBS -l select=128 |  | ||||||
| #PBS -l walltime=02:00:00 |  | ||||||
| #PBS -A LatticeQCD_aesp_CNDA |  | ||||||
|  |  | ||||||
| #export OMP_PROC_BIND=spread |  | ||||||
| #unset OMP_PLACES |  | ||||||
|  |  | ||||||
| cd $PBS_O_WORKDIR |  | ||||||
|  |  | ||||||
| source ../sourceme.sh |  | ||||||
|  |  | ||||||
| cat $PBS_NODEFILE |  | ||||||
|  |  | ||||||
| export OMP_NUM_THREADS=3 |  | ||||||
| export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 |  | ||||||
|  |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE |  | ||||||
| #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST |  | ||||||
|  |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0 |  | ||||||
| export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16 |  | ||||||
| export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16 |  | ||||||
| export MPICH_OFI_NIC_POLICY=GPU |  | ||||||
|  |  | ||||||
| # 12 ppn, 16 nodes, 192 ranks |  | ||||||
| # 12 ppn, 128 nodes, 1536 ranks |  | ||||||
| CMD="mpiexec -np 1536 -ppn 12  -envall \ |  | ||||||
| 	     ./gpu_tile_compact.sh \ |  | ||||||
| 	     ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \ |  | ||||||
| 		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap " |  | ||||||
| $CMD  |  | ||||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user