Mirror of https://github.com/paboyle/Grid.git, synced 2025-11-04 14:04:32 +00:00
Compare commits

14 commits: 2ae980ae43 ... b812a7b4c6
Commits: b812a7b4c6, 891a366f73, 10116b3be8, a46a0f0882, a26a8a38f4, 7435315d50, 9b5f741e85, 517822fdd2, 1b93a9be88, 783a66b348, 976c3e9b59, f8ca971dae, 21bc8c24df, 30228214f7
@@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-//#undef EIGEN_USE_SYCL
+#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif
 
@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
  * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
  * type = 1 for the approximation which is infinite at x = 0. */
 
-zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
+zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
   INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
     l, invlambda, xi, xisq, *tv, s, opl;
   int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
   construct_partfrac(d);
   construct_contfrac(d);
 
-  /* Converting everything to PRECISION for external use only */
+  /* Converting everything to ZOLO_PRECISION for external use only */
 
   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
   zd -> n = d -> n;
   zd -> type = d -> type;
   zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
   zd -> deg_num = d -> deg_num;
   zd -> deg_denom = d -> deg_denom;
 
-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
   free(d -> a);
 
-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
   free(d -> ap);
 
-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
   free(d -> alpha);
 
-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
   free(d -> beta);
 
-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
   free(d -> gamma);
 
   free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
 }
 
 
-zolotarev_data* higham(PRECISION epsilon, int n) {
+zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
   INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
   int m, czero;
   zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
   /* Converting everything to PRECISION for external use only */
 
   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
   zd -> n = d -> n;
   zd -> type = d -> type;
   zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
   zd -> deg_num = d -> deg_num;
   zd -> deg_denom = d -> deg_denom;
 
-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
   free(d -> a);
 
-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
   free(d -> ap);
 
-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
   free(d -> alpha);
 
-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
   free(d -> beta);
 
-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
   free(d -> gamma);
 
   free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
 #ifdef TEST
 
 #undef ZERO
-#define ZERO ((PRECISION) 0)
+#define ZERO ((ZOLO_PRECISION) 0)
 #undef ONE
-#define ONE ((PRECISION) 1)
+#define ONE ((ZOLO_PRECISION) 1)
 #undef TWO
-#define TWO ((PRECISION) 2)
+#define TWO ((ZOLO_PRECISION) 2)
 
 /* Evaluate the rational approximation R(x) using the factored form */
 
-static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R;
+  ZOLO_PRECISION R;
 
   if (rdata -> type == 0) {
     R = rdata -> A * x;
@@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
 
 /* Evaluate the rational approximation R(x) using the partial fraction form */
 
-static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R = rdata -> alpha[rdata -> da - 1];
+  ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
   for (m = 0; m < rdata -> dd; m++)
     R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
   if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
  * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
  * but with signalling overflow you will get an error message. */
 
-static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R = rdata -> beta[0] * x;
+  ZOLO_PRECISION R = rdata -> beta[0] * x;
   for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
   return R;
 }    
 
 /* Evaluate the rational approximation R(x) using Cayley form */
 
-static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION T;
+  ZOLO_PRECISION T;
 
   T = rdata -> type == 0 ? ONE : -ONE;
   for (m = 0; m < rdata -> n; m++)
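The continued-fraction form above is evaluated bottom-up: seed R = beta[0]*x, then repeatedly fold in the next coefficient as R = beta[m]*x + 1/R (which is also why the comment discusses 1/0 overflow behaviour). A standalone sketch of that recurrence with made-up coefficients, not real Zolotarev data:

    #include <cstdio>

    int main(void) {
      double beta[] = {0.5, 1.25, 2.0};  /* hypothetical coefficients, db = 3 */
      double x = 0.3;
      double R = beta[0] * x;            /* seed, as in zolotarev_contfrac_eval */
      for (int m = 1; m < 3; m++) R = beta[m] * x + 1.0 / R;
      printf("R(%g) = %g\n", x, R);
      return 0;
    }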
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
   int m, n, plotpts = 5000, type = 0;
   float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
   zolotarev_data *rdata;
-  PRECISION y;
+  ZOLO_PRECISION y;
   FILE *plot_function, *plot_error, 
     *plot_partfrac, *plot_contfrac, *plot_cayley;
 
@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
   }
 
   rdata = type == 2 
-    ? higham((PRECISION) eps, n) 
-    : zolotarev((PRECISION) eps, n, type);
+    ? higham((ZOLO_PRECISION) eps, n) 
+    : zolotarev((ZOLO_PRECISION) eps, n, type);
 
   printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t" 
 	 STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
 	 "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
-	 "\tPRECISION = " STRINGIFY(PRECISION)
+	 "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
 	 "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
 	 "\tDelta = %g (maximum error)\n\n"
 	 "\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
     x = 2.4 * (float) m / plotpts - 1.2;
     if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
       /* skip x = 0 for type 1, as R(0) is singular */
-      y = zolotarev_eval((PRECISION) x, rdata);
+      y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
       fprintf(plot_function, "%g %g\n", x, (float) y);
       fprintf(plot_error, "%g %g\n",
 	      x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
-      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
+      ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
+      ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
-      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
+      ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
 		       / rdata -> Delta);
       if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
 	maxypferr = MAX(maxypferr, fabs(ypferr));
@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
 
 #ifndef ZOLOTAREV_INTERNAL
-#ifndef PRECISION
-#define PRECISION double
+#ifndef ZOLO_PRECISION
+#define ZOLO_PRECISION double
 #endif
-#define ZPRECISION PRECISION
+#define ZPRECISION ZOLO_PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif
 
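The header hunk shows the point of the whole rename: PRECISION is a generic macro name that an unrelated header or user translation unit can also define, silently retyping every later use, whereas the ZOLO_ prefix scopes the knob to this library. A sketch of the failure mode being avoided (toy macros, not Grid code):

    #include <iostream>

    #define ZOLO_PRECISION double  // library-prefixed: nobody else claims this
    #define PRECISION float        // generic: another header may define it first

    static ZOLO_PRECISION twice(ZOLO_PRECISION x) { return 2 * x; }

    int main() {
      // sizeof(PRECISION) depends on whoever defined PRECISION last;
      // sizeof(ZOLO_PRECISION) does not.
      std::cout << sizeof(PRECISION) << " vs " << sizeof(ZOLO_PRECISION) << "\n"; // 4 vs 8
      std::cout << twice(1.5) << "\n";                                            // 3
      return 0;
    }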
@@ -77,8 +77,8 @@ typedef struct {
  * zolotarev_data structure. The arguments must satisfy the constraints that
  * epsilon > 0, n > 0, and type = 0 or 1. */
 
-ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif
 
@@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
+
@@ -34,9 +34,14 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_SYCL
-#error // need oneMKL version
+#include <oneapi/mkl.hpp>
+#endif
+#if 0
+#define GRID_ONE_MKL
+#endif
+#ifdef GRID_ONE_MKL
+#include <oneapi/mkl.hpp>
 #endif
 
 ///////////////////////////////////////////////////////////////////////	  
 // Need to rearrange lattice data to be in the right format for a
 // batched multiply. Might as well make these static, dense packed
@@ -49,9 +54,12 @@ NAMESPACE_BEGIN(Grid);
   typedef cudablasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-  typedef int32_t gridblasHandle_t;
+  typedef cl::sycl::queue *gridblasHandle_t;
+#endif
+#ifdef GRID_ONE_MKL
+  typedef cl::sycl::queue *gridblasHandle_t;
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
   typedef int32_t gridblasHandle_t;
 #endif
 
@@ -76,6 +84,12 @@ public:
       hipblasCreate(&gridblasHandle);
 #endif
 #ifdef GRID_SYCL
+      gridblasHandle = theGridAccelerator;
+#endif
+#ifdef GRID_ONE_MKL
+      cl::sycl::cpu_selector selector;
+      cl::sycl::device selectedDevice { selector };
+      gridblasHandle =new sycl::queue (selectedDevice);
 #endif
       gridblasInit=1;
     }
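The constructor body follows a once-only global-init pattern: the first GridBLAS instance creates the backend handle (cuBLAS/hipBLAS handle, or here a SYCL queue) and every later instance reuses it. The same pattern in miniature, with a stand-in handle type rather than Grid's:

    #include <iostream>

    struct Handle { int id; };              // stand-in for gridblasHandle_t

    static Handle *gridblasHandle = nullptr;
    static int     gridblasInit   = 0;

    struct GridBLAS {
      GridBLAS() {
        if (!gridblasInit) {                // only the first instance pays setup
          gridblasHandle = new Handle{42};
          gridblasInit = 1;
        }
      }
    };

    int main() {
      GridBLAS a, b;                        // b reuses the handle a created
      std::cout << gridblasHandle->id << "\n";
      return 0;
    }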
@@ -110,6 +124,9 @@ public:
 #endif
 #ifdef GRID_SYCL
     accelerator_barrier();
+#endif
+#ifdef GRID_ONE_MKL
+    gridblasHandle->wait();
 #endif
   }
   
@@ -644,10 +661,19 @@ public:
 			      (cuDoubleComplex *) Cmn, ldc, sdc,
 			      batchCount);
 #endif
-#ifdef GRID_SYCL
-     #warning "oneMKL implementation not made "
+#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						oneapi::mkl::transpose::N,
+						oneapi::mkl::transpose::N,
+						m,n,k,
+						alpha,
+						(const ComplexD *)Amk,lda,sda,
+						(const ComplexD *)Bkn,ldb,sdb,
+						beta,
+						(ComplexD *)Cmn,ldc,sdc,
+						batchCount);
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
      // Need a default/reference implementation
      for (int p = 0; p < batchCount; ++p) {
        for (int mm = 0; mm < m; ++mm) {
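The oneMKL call above is a strided batched GEMM: batchCount independent column-major products C_p = alpha*A_p*B_p + beta*C_p, with the p-th matrices located at fixed strides sda/sdb/sdc from the base pointers. A self-contained reference sketch of those semantics in plain C++ (mirroring the fallback loop kept for the no-GPU build, not the MKL implementation itself):

    #include <complex>
    #include <iostream>
    #include <vector>

    using ComplexD = std::complex<double>;

    // C_p = alpha * A_p * B_p + beta * C_p, column-major, p = 0..batchCount-1
    void gemm_batch_ref(int m, int n, int k, ComplexD alpha,
                        const ComplexD *A, int lda, int sda,
                        const ComplexD *B, int ldb, int sdb,
                        ComplexD beta, ComplexD *C, int ldc, int sdc,
                        int batchCount) {
      for (int p = 0; p < batchCount; ++p)
        for (int nn = 0; nn < n; ++nn)
          for (int mm = 0; mm < m; ++mm) {
            ComplexD acc = 0.0;
            for (int kk = 0; kk < k; ++kk)
              acc += A[p*sda + kk*lda + mm] * B[p*sdb + nn*ldb + kk];
            C[p*sdc + nn*ldc + mm] = alpha*acc + beta*C[p*sdc + nn*ldc + mm];
          }
    }

    int main() {
      // batch of two 2x2 identity * identity products
      std::vector<ComplexD> A(8, 0.0), B(8, 0.0), C(8, 0.0);
      for (int p = 0; p < 2; ++p)
        for (int i = 0; i < 2; ++i) A[p*4 + i*2 + i] = B[p*4 + i*2 + i] = 1.0;
      gemm_batch_ref(2,2,2, 1.0, A.data(),2,4, B.data(),2,4, 0.0, C.data(),2,4, 2);
      std::cout << C[0] << " " << C[3] << "\n";  // (1,0) (1,0)
      return 0;
    }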
@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
+#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 #include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>

@@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
-#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
   }
 }
 
-template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
 {
   autoView( buf_v , buf, CpuRead);
   return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 
-#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 
 NAMESPACE_END(Grid);
 
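crc() fingerprints a lattice field by running zlib's crc32 over its raw bytes; taking the field by const reference now lets it be called on const and temporary arguments (which the flight-log call in the next hunk wants). The idea in isolation, a sketch to be compiled with -lz:

    #include <zlib.h>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> field(1024, 3.14);          // stand-in for lattice data
      uint32_t csum = ::crc32(0L, (unsigned char *)field.data(),
                              (size_t)(field.size() * sizeof(double)));
      std::printf("FingerPrint %08x\n", csum);
      return 0;
    }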
@@ -285,6 +285,7 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
   ComplexD nrm = rankInnerProduct(left,right);
+  //  std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
   grid->GlobalSum(nrm);
   return nrm;
 }
@@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 
 NAMESPACE_END(Grid);
 
+#ifdef GRID_SYCL
+template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealF   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealD   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
+#endif
+
+
 #endif
@@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid);
   };
 NAMESPACE_END(Grid);
 
+
+#ifdef GRID_SYCL
+template<typename T> struct
+sycl::is_device_copyable<T, typename std::enable_if<
+			      Grid::isGridTensor<T>::value  && (!std::is_trivially_copyable<T>::value),
+			      void>::type>
+  : public std::true_type {};
+#endif
+
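Both device_copyable hunks do the same job at different granularity: the SIMD header opts in five concrete vector types via explicit specializations, while this one opts in the whole isGridTensor family through a single enable_if-gated partial specialization. The gating pattern in plain C++, with stand-in traits rather than SYCL:

    #include <iostream>
    #include <type_traits>

    template <class T> struct is_grid_tensor : std::false_type {};  // stand-in for Grid::isGridTensor
    template <class T, class Enable = void>
    struct device_copyable : std::false_type {};                    // stand-in for sycl::is_device_copyable

    struct MyTensor { double data[4]; };                            // hypothetical tensor type
    template <> struct is_grid_tensor<MyTensor> : std::true_type {};

    // blanket opt-in for every tensor type, as in the hunk above
    template <class T>
    struct device_copyable<T, typename std::enable_if<is_grid_tensor<T>::value>::type>
      : std::true_type {};

    int main() {
      std::cout << device_copyable<MyTensor>::value
                << device_copyable<int>::value << "\n";  // prints 10
      return 0;
    }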
@@ -255,17 +255,13 @@ inline int  acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC
 
 NAMESPACE_END(Grid);
-#if 0
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
-#include <level_zero/ze_api.h>
-#include <CL/sycl/backend/level_zero.hpp>
-#else
+
+// Force deterministic reductions
+#define SYCL_REDUCTION_DETERMINISTIC
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
-#endif
 
 NAMESPACE_BEGIN(Grid);
 
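The new SYCL_REDUCTION_DETERMINISTIC define exists because floating-point addition is not associative: a reduction whose summation order varies between runs gives bitwise-different results, which would defeat the checksum-based reproducibility tests added later in this series. The underlying effect in two lines:

    #include <cstdio>

    int main() {
      float a = 1e8f, b = -1e8f, c = 1.0f;
      std::printf("%g vs %g\n", (a + b) + c, a + (b + c));  // 1 vs 0
      return 0;
    }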
@@ -393,6 +393,9 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
   std::cout << GridLogMessage << "================================================ "<<std::endl;
 
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
 
   /////////////////////////////////////////////////////////
   // Reporting
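Reporting the host per rank makes a failure on a large machine attributable to a specific node. The added lines as a standalone POSIX program (HOST_NAME_MAX comes from <climits> on Linux/glibc):

    #include <unistd.h>
    #include <climits>
    #include <iostream>

    int main() {
      char hostname[HOST_NAME_MAX+1];
      gethostname(hostname, HOST_NAME_MAX+1);
      std::cout << "This rank is running on host " << hostname << std::endl;
      return 0;
    }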
@@ -219,7 +219,7 @@ public:
     uint64_t NN;
 
 
-  uint64_t lmax=32;
+  uint64_t lmax=40;
 #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
     GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
@@ -454,11 +454,17 @@ public:
       pickCheckerboard(Even,src_e,src);
       pickCheckerboard(Odd,src_o,src);
 
-      const int num_cases = 1;
+#ifdef AVX512
+      const int num_cases = 3;
+#else 
+      const int num_cases = 2;
+#endif      
       std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
 
       controls Cases [] = {
-	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
+	{  WilsonKernelsStatic::OptGeneric   ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+	{  WilsonKernelsStatic::OptHandUnroll,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+	{  WilsonKernelsStatic::OptInlineAsm ,  WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent }
       }; 
 
      for(int c=0;c<num_cases;c++) {
@@ -469,6 +475,10 @@ public:
 
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM      WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
 	std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
@@ -614,11 +624,13 @@ public:
       pickCheckerboard(Even,src_e,src);
       pickCheckerboard(Odd,src_o,src);
     
-      const int num_cases = 1;
+      const int num_cases = 2;
       std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
       
       controls Cases [] = {
 	{  StaggeredKernelsStatic::OptGeneric   ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptHandUnroll,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  },
+	{  StaggeredKernelsStatic::OptInlineAsm ,  StaggeredKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicyConcurrent  }
       }; 
 
       for(int c=0;c<num_cases;c++) {
@@ -847,11 +859,8 @@ int main (int argc, char ** argv)
   }
 
   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-#ifdef KNL
-  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
-#else
   LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-#endif
+
   Benchmark::Decomposition();
 
   int do_su4=0;
@@ -910,7 +919,7 @@ int main (int argc, char ** argv)
   }
 
   if ( do_blas ) {
-#if defined(GRID_CUDA) || defined(GRID_HIP)    
+#if defined(GRID_CUDA) || defined(GRID_HIP)     || defined(GRID_SYCL)   
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
+export FI_CXI_CQ_FILL_PERCENT=10
+export FI_CXI_DEFAULT_CQ_SIZE=262144
+#export FI_CXI_DEFAULT_CQ_SIZE=131072
+#export FI_CXI_CQ_FILL_PERCENT=20
 
 # 12 ppn, 32 nodes, 384 ranks
 #

@@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.small
+$CMD | tee 1024node.dwf.small.cq
 
 CMD="mpiexec -np 12288 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf
+$CMD | tee 1024node.dwf.cq
 
 
@@ -17,6 +17,7 @@ source ../sourceme.sh
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
+
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

@@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 
-$CMD 
+#$CMD 
 
 CMD="mpiexec -np 24 -ppn 12  -envall \
 	     ./gpu_tile_compact.sh \
 	     ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
 		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 
+#$CMD 
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD 
+
+CMD="mpiexec -np 1 -ppn 1  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+		--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
 $CMD 
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 	--enable-unified=no \
 	MPICXX=mpicxx \
 	CXX=icpx \
-	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+	LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
+	CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"
 
@@ -3,6 +3,19 @@
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
 
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+export FI_CXI_CQ_FILL_PERCENT=20
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
+
+#
+# -ftarget-register-alloc-mode=pvc:default 
+# -ftarget-register-alloc-mode=pvc:small
+# -ftarget-register-alloc-mode=pvc:large
+# -ftarget-register-alloc-mode=pvc:auto
+#
+
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128

systems/Aurora/tests/repro16.pbs (new file, 40 lines)
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+		--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD 

systems/Aurora/tests/solver/stag16.pbs (new file, 40 lines)
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=16
+#PBS -l walltime=01:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+CMD="mpiexec -np 192 -ppn 12  -envall \
+	     ./gpu_tile_compact.sh \
+	     ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
+	     --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
+$CMD 
@@ -30,27 +30,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 
-template<class d>
-struct scal {
-  d internal;
-};
-
-  Gamma::Algebra Gmu [] = {
-    Gamma::Algebra::GammaX,
-    Gamma::Algebra::GammaY,
-    Gamma::Algebra::GammaZ,
-    Gamma::Algebra::GammaT
-  };
-
 int main (int argc, char ** argv)
 {
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::string host(hostname);
+  
   Grid_init(&argc,&argv);
 
   const int Ls=12;
 
-  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
-
-  { 
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -92,7 +81,14 @@ int main (int argc, char ** argv)
   SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
   SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
-  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
+  int nsecs=600;
+  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
+    std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
+    GridCmdOptionInt(arg,nsecs);
+  }
+
+  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
+
   MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
   double t1,t2,flops;
   double MdagMsiteflops = 1452; // Mobius (real coeffs)
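Editor's note: the one-shot "Starting mixed CG" banner becomes a configurable soak: a --seconds flag (default 600) is parsed with Grid's GridCmdOptionExists/GridCmdOptionPayload/GridCmdOptionInt helpers. As the hunks below show, the first half of the budget drives repeated mixed-precision solves and the full deadline bounds the double-precision phase.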
@@ -101,7 +97,14 @@ int main (int argc, char ** argv)
   std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
   std:: cout << " CG    site flops = "<< CGsiteflops <<std::endl;
   int iters;
-  for(int i=0;i<10;i++){
+
+  time_t start = time(NULL);
+
+  uint32_t csum, csumref;
+  csumref=0;
+  int iter=0;
+  do {
+    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
     result_o = Zero();
     t1=usecond();
     mCG(src_o,result_o);
 | 
				
			|||||||
    flops+= CGsiteflops*FrbGrid->gSites()*iters;
 | 
					    flops+= CGsiteflops*FrbGrid->gSites()*iters;
 | 
				
			||||||
    std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
 | 
					    std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
 | 
				
			||||||
    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
 | 
					    std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
 | 
				
			||||||
  }
 | 
					
 | 
				
			||||||
  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
 | 
					    csum = crc(result_o);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if ( csumref == 0 ) {
 | 
				
			||||||
 | 
					      csumref = csum;
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					      if ( csum != csumref ) { 
 | 
				
			||||||
 | 
						std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
 | 
				
			||||||
 | 
						assert(0);
 | 
				
			||||||
 | 
					      } else {
 | 
				
			||||||
 | 
						std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
 | 
				
			||||||
 | 
					      }
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    iter ++;
 | 
				
			||||||
 | 
					  } while (time(NULL) < (start + nsecs/2) );
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
 | 
				
			||||||
  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
 | 
					  ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
 | 
				
			||||||
  for(int i=0;i<1;i++){
 | 
					  csumref=0;
 | 
				
			||||||
 | 
					  int i=0;
 | 
				
			||||||
 | 
					  do { 
 | 
				
			||||||
 | 
					    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
 | 
				
			||||||
    result_o_2 = Zero();
 | 
					    result_o_2 = Zero();
 | 
				
			||||||
    t1=usecond();
 | 
					    t1=usecond();
 | 
				
			||||||
    CG(HermOpEO,src_o,result_o_2);
 | 
					    CG(HermOpEO,src_o,result_o_2);
 | 
				
			||||||
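Editor's note: both timed loops call a small crc() helper defined earlier in the test, outside this excerpt. A minimal sketch of such a helper, assuming zlib's crc32 and Grid's autoView host accessor (the real definition may differ):

#include <zlib.h>

// Sketch only: checksum the rank-local field data so that two
// bit-identical solves yield equal values; each rank checks its
// own local volume, and failures are reported per host above.
uint32_t crc(LatticeFermionD & z)
{
  autoView(z_v, z, CpuRead);   // host-readable view of the lattice data
  return ::crc32(0x0, (unsigned char *)&z_v[0],
                 (size_t)(sizeof(vSpinColourVectorD) * z.oSites()));
}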
@@ -125,43 +146,27 @@ int main (int argc, char ** argv)
 
     std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
     std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
 
-  //  MemoryManager::Print();
+    csum = crc(result_o);
+
+    if ( csumref == 0 ) {
+      csumref = csum;
+    } else {
+      if ( csum != csumref ) { 
+	std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
+	assert(0);
+      } else {
+	std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
+      }
+    }
+    i++;
+  } while (time(NULL) < (start + nsecs) );
+
   LatticeFermionD diff_o(FrbGrid);
   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
 
   std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
+  assert(diff < 1e-4);
-  #ifdef HAVE_LIME
-  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
-  
-  std::string file1("./Propagator1");
-  emptyUserRecord record;
-  uint32_t nersc_csum;
-  uint32_t scidac_csuma;
-  uint32_t scidac_csumb;
-  typedef SpinColourVectorD   FermionD;
-  typedef vSpinColourVectorD vFermionD;
-
-  BinarySimpleMunger<FermionD,FermionD> munge;
-  std::string format = getFormatString<vFermionD>();
-  
-  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
-						   nersc_csum,scidac_csuma,scidac_csumb);
-
-  std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-
-  BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
-						   nersc_csum,scidac_csuma,scidac_csumb);
-
-  std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-  }
-  #endif
-  }
-  
-  MemoryManager::Print();
   
   Grid_finalize();
 }
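Editor's note: the old #ifdef HAVE_LIME / --checksums path, which wrote propagators to disk purely to compare SciDAC checksums, is retired: the in-memory crc comparison now runs unconditionally, and the final assert(diff < 1e-4) cross-checks the mixed- and double-precision answers. Distilled to its skeleton (a sketch under stated assumptions, not the test's literal code; run_solve() is a hypothetical stand-in for one solve plus checksum), the reproducibility loop is:

#include <ctime>
#include <cstdint>
#include <cassert>

// Hypothetical stand-in: a real test would solve from a zeroed guess
// here and return a checksum of the result field.
uint32_t run_solve() { return 0xDEADBEEF; }

// Repeat the solve until the wall-clock budget expires, requiring every
// repetition to reproduce the first pass bit-for-bit.
void soak(int nsecs)
{
  time_t   start   = time(NULL);
  uint32_t csumref = 0;                 // checksum of the first pass
  do {
    uint32_t csum = run_solve();
    if (csumref == 0) csumref = csum;   // first pass defines the reference
    else assert(csum == csumref);       // later passes must match exactly
  } while (time(NULL) < start + nsecs);
}

Running the same solve under a wall-clock budget rather than a fixed iteration count is what makes this usable as a machine burn-in: flaky hardware surfaces as a named-host checksum mismatch instead of a silent wrong answer.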