Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-20 16:56:55 +01:00).
Comparing 54 commits: 04ca065281...feature/gp.
@@ -34,7 +34,7 @@
 #pragma push_macro("__SYCL_DEVICE_ONLY__")
 #undef __SYCL_DEVICE_ONLY__
 #define EIGEN_DONT_VECTORIZE
-//#undef EIGEN_USE_SYCL
+#undef EIGEN_USE_SYCL
 #define __SYCL__REDEFINE__
 #endif
 
@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #define _GRID_FFT_H_
 
 #ifdef HAVE_FFTW
-#ifdef USE_MKL
+#if defined(USE_MKL) || defined(GRID_SYCL)
 #include <fftw/fftw3.h>
 #else
 #include <fftw3.h>
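The FFT header change above lets GRID_SYCL builds pick up MKL's FFTW3 wrapper header; the calling code is untouched because the wrapper exports the standard FFTW3 interface. A minimal sketch of that interface, independent of Grid:

```cpp
#include <fftw3.h>   // MKL's <fftw/fftw3.h> wrapper exposes the same declarations
 
// One forward 1-D complex transform via the standard FFTW3 API.
void fft_1d_once(int N) {
  fftw_complex *in  = fftw_alloc_complex(N);
  fftw_complex *out = fftw_alloc_complex(N);
  fftw_plan p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
  fftw_execute(p);                 // in -> out, forward transform
  fftw_destroy_plan(p);
  fftw_free(in);
  fftw_free(out);
}
```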
@@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
 * Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
 * type = 1 for the approximation which is infinite at x = 0. */
 
-zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
+zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
   INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
     l, invlambda, xi, xisq, *tv, s, opl;
   int m, czero, ts;
@@ -375,12 +375,12 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
   construct_partfrac(d);
   construct_contfrac(d);
 
-  /* Converting everything to PRECISION for external use only */
+  /* Converting everything to ZOLO_PRECISION for external use only */
 
   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
   zd -> n = d -> n;
   zd -> type = d -> type;
   zd -> dn = d -> dn;
@@ -390,24 +390,24 @@ zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
   zd -> deg_num = d -> deg_num;
   zd -> deg_denom = d -> deg_denom;
 
-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
   free(d -> a);
 
-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
   free(d -> ap);
 
-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
   free(d -> alpha);
 
-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
   free(d -> beta);
 
-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
   free(d -> gamma);
 
   free(d);
@@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
 }
 
 
-zolotarev_data* higham(PRECISION epsilon, int n) {
+zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
   INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
   int m, czero;
   zolotarev_data *zd;
@@ -481,9 +481,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
   /* Converting everything to PRECISION for external use only */
 
   zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
-  zd -> A = (PRECISION) d -> A;
-  zd -> Delta = (PRECISION) d -> Delta;
-  zd -> epsilon = (PRECISION) d -> epsilon;
+  zd -> A = (ZOLO_PRECISION) d -> A;
+  zd -> Delta = (ZOLO_PRECISION) d -> Delta;
+  zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
   zd -> n = d -> n;
   zd -> type = d -> type;
   zd -> dn = d -> dn;
@@ -493,24 +493,24 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
   zd -> deg_num = d -> deg_num;
   zd -> deg_denom = d -> deg_denom;
 
-  zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
-  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
+  zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
   free(d -> a);
 
-  zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
-  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
+  zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
   free(d -> ap);
 
-  zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
-  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
+  zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
   free(d -> alpha);
 
-  zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
-  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
+  zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
   free(d -> beta);
 
-  zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
-  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
+  zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
+  for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
   free(d -> gamma);
 
   free(d);
@@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
 #ifdef TEST
 
 #undef ZERO
-#define ZERO ((PRECISION) 0)
+#define ZERO ((ZOLO_PRECISION) 0)
 #undef ONE
-#define ONE ((PRECISION) 1)
+#define ONE ((ZOLO_PRECISION) 1)
 #undef TWO
-#define TWO ((PRECISION) 2)
+#define TWO ((ZOLO_PRECISION) 2)
 
 /* Evaluate the rational approximation R(x) using the factored form */
 
-static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R;
+  ZOLO_PRECISION R;
 
   if (rdata -> type == 0) {
     R = rdata -> A * x;
@@ -551,9 +551,9 @@ static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
 
 /* Evaluate the rational approximation R(x) using the partial fraction form */
 
-static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R = rdata -> alpha[rdata -> da - 1];
+  ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
   for (m = 0; m < rdata -> dd; m++)
     R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
   if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@@ -568,18 +568,18 @@ static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
 * non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
 * but with signalling overflow you will get an error message. */
 
-static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION R = rdata -> beta[0] * x;
+  ZOLO_PRECISION R = rdata -> beta[0] * x;
   for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
   return R;
 }
 
 /* Evaluate the rational approximation R(x) using Cayley form */
 
-static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
+static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
   int m;
-  PRECISION T;
+  ZOLO_PRECISION T;
 
   T = rdata -> type == 0 ? ONE : -ONE;
   for (m = 0; m < rdata -> n; m++)
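For reference, the continued-fraction evaluation touched above is small enough to model standalone. A sketch in plain doubles, with beta and db standing in for the zolotarev_data fields (illustrative, not Grid's code):

```cpp
#include <cassert>

// Continued-fraction form of the rational approximation, built from the
// innermost term outward: R = beta[db-1]*x + 1/(... + 1/(beta[0]*x)).
double contfrac_eval(double x, const double *beta, int db) {
  assert(db > 0);
  double R = beta[0] * x;
  for (int m = 1; m < db; ++m)
    R = beta[m] * x + 1.0 / R;  // non-signalling IEEE overflow makes 1/(1/0) == 0 safe
  return R;
}
```

As the comment in the hunk notes, this relies on non-signalling overflow; with signalling overflow the x = 0 case of the type-1 approximation raises an error instead of cancelling through 1/INF.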
@@ -607,7 +607,7 @@ int main(int argc, char** argv) {
   int m, n, plotpts = 5000, type = 0;
   float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
   zolotarev_data *rdata;
-  PRECISION y;
+  ZOLO_PRECISION y;
   FILE *plot_function, *plot_error,
     *plot_partfrac, *plot_contfrac, *plot_cayley;
 
@@ -626,13 +626,13 @@ int main(int argc, char** argv) {
   }
 
   rdata = type == 2
-    ? higham((PRECISION) eps, n)
-    : zolotarev((PRECISION) eps, n, type);
+    ? higham((ZOLO_PRECISION) eps, n)
+    : zolotarev((ZOLO_PRECISION) eps, n, type);
 
   printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
          STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
          "\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
-         "\tPRECISION = " STRINGIFY(PRECISION)
+         "\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
          "\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
          "\tDelta = %g (maximum error)\n\n"
         "\tA = %g (overall factor)\n",
@@ -681,15 +681,15 @@ int main(int argc, char** argv) {
     x = 2.4 * (float) m / plotpts - 1.2;
     if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
       /* skip x = 0 for type 1, as R(0) is singular */
-      y = zolotarev_eval((PRECISION) x, rdata);
+      y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
       fprintf(plot_function, "%g %g\n", x, (float) y);
       fprintf(plot_error, "%g %g\n",
               x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
-      ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
+      ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
                        / rdata -> Delta);
-      ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
+      ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
                        / rdata -> Delta);
-      ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
+      ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
                        / rdata -> Delta);
       if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
         maxypferr = MAX(maxypferr, fabs(ypferr));
@@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
 #define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
 
 #ifndef ZOLOTAREV_INTERNAL
-#ifndef PRECISION
-#define PRECISION double
+#ifndef ZOLO_PRECISION
+#define ZOLO_PRECISION double
 #endif
-#define ZPRECISION PRECISION
+#define ZPRECISION ZOLO_PRECISION
 #define ZOLOTAREV_DATA zolotarev_data
 #endif
 
@@ -77,8 +77,8 @@ typedef struct {
 * zolotarev_data structure. The arguments must satisfy the constraints that
 * epsilon > 0, n > 0, and type = 0 or 1. */
 
-ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
-ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
+ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
+ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
 void zolotarev_free(zolotarev_data *zdata);
 #endif
 
@@ -86,3 +86,4 @@ void zolotarev_free(zolotarev_data *zdata);
 NAMESPACE_END(Approx);
 NAMESPACE_END(Grid);
 #endif
+
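The PRECISION to ZOLO_PRECISION rename across both files keeps the old configuration pattern: define the macro before the header is seen to select the external precision, otherwise it defaults to double. A sketch of client usage (the include path is illustrative, not the real Grid-internal path):

```cpp
// Define ZOLO_PRECISION before the header is processed to override the default.
#define ZOLO_PRECISION float
#include "Zolotarev.h"   // illustrative include path

// zolotarev()/higham() now take a float epsilon and fill float coefficient arrays.
```

The less generic token presumably also avoids collisions with other definitions of PRECISION elsewhere in a large build.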
Grid/algorithms/blas/BatchedBlas.cc (new file, 34 lines)
@@ -0,0 +1,34 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: BatchedBlas.h
+
+    Copyright (C) 2023
+
+Author: Peter Boyle <pboyle@bnl.gov>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#include <Grid/GridCore.h>
+#include <Grid/algorithms/blas/BatchedBlas.h>
+NAMESPACE_BEGIN(Grid);
+gridblasHandle_t GridBLAS::gridblasHandle;
+int              GridBLAS::gridblasInit;
+NAMESPACE_END(Grid);
+
@@ -31,12 +31,17 @@ Author: Peter Boyle <pboyle@bnl.gov>
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
-#include <hipblas/hipblas.h>
+#include <cublas_v2.h>
 #endif
 #ifdef GRID_SYCL
-#error // need oneMKL version
+#include <oneapi/mkl.hpp>
+#endif
+#if 0
+#define GRID_ONE_MKL
+#endif
+#ifdef GRID_ONE_MKL
+#include <oneapi/mkl.hpp>
 #endif
 
 ///////////////////////////////////////////////////////////////////////
 // Need to rearrange lattice data to be in the right format for a
 // batched multiply. Might as well make these static, dense packed
@@ -46,12 +51,15 @@ NAMESPACE_BEGIN(Grid);
 typedef hipblasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_CUDA
-typedef cudablasHandle_t gridblasHandle_t;
+typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-typedef int32_t gridblasHandle_t;
+typedef cl::sycl::queue *gridblasHandle_t;
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#ifdef GRID_ONE_MKL
+typedef cl::sycl::queue *gridblasHandle_t;
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
 typedef int32_t gridblasHandle_t;
 #endif
 
@@ -70,12 +78,19 @@ public:
 #ifdef GRID_CUDA
       std::cout << "cublasCreate"<<std::endl;
       cublasCreate(&gridblasHandle);
+      cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
 #endif
 #ifdef GRID_HIP
       std::cout << "hipblasCreate"<<std::endl;
       hipblasCreate(&gridblasHandle);
 #endif
 #ifdef GRID_SYCL
+      gridblasHandle = theGridAccelerator;
+#endif
+#ifdef GRID_ONE_MKL
+      cl::sycl::cpu_selector selector;
+      cl::sycl::device selectedDevice { selector };
+      gridblasHandle =new sycl::queue (selectedDevice);
 #endif
       gridblasInit=1;
     }
@@ -110,6 +125,9 @@ public:
 #endif
 #ifdef GRID_SYCL
     accelerator_barrier();
+#endif
+#ifdef GRID_ONE_MKL
+    gridblasHandle->wait();
 #endif
   }
 
@@ -615,9 +633,10 @@ public:
     deviceVector<ComplexD> beta_p(1);
     acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
-    std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
-    std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
+
+    //    std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
+    //    std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
+    //    std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
 #ifdef GRID_HIP
     auto err = hipblasZgemmStridedBatched(gridblasHandle,
                                           HIPBLAS_OP_N,
@@ -643,10 +662,19 @@ public:
                                           (cuDoubleComplex *) Cmn, ldc, sdc,
                                           batchCount);
 #endif
-#ifdef GRID_SYCL
-#warning "oneMKL implementation not made "
+#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+                                                oneapi::mkl::transpose::N,
+                                                oneapi::mkl::transpose::N,
+                                                m,n,k,
+                                                alpha,
+                                                (const ComplexD *)Amk,lda,sda,
+                                                (const ComplexD *)Bkn,ldb,sdb,
+                                                beta,
+                                                (ComplexD *)Cmn,ldc,sdc,
+                                                batchCount);
 #endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
     // Need a default/reference implementation
     for (int p = 0; p < batchCount; ++p) {
       for (int mm = 0; mm < m; ++mm) {
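The SYCL branch now dispatches to oneMKL's strided-batch GEMM instead of erroring out. A self-contained sketch of the same USM call against a plain SYCL queue; the wrapper name, the queue handling and the final wait() are assumptions rather than Grid's exact usage:

```cpp
#include <oneapi/mkl.hpp>
#include <sycl/sycl.hpp>
#include <complex>
#include <cstdint>

using ComplexD = std::complex<double>;

// C_p = alpha*A_p*B_p + beta*C_p for p = 0..batch-1, column-major, with the
// p-th matrix offset by sda/sdb/sdc elements inside each (USM-allocated) array.
void zgemm_strided_batched(sycl::queue &q, std::int64_t m, std::int64_t n, std::int64_t k,
                           ComplexD alpha,
                           const ComplexD *A, std::int64_t lda, std::int64_t sda,
                           const ComplexD *B, std::int64_t ldb, std::int64_t sdb,
                           ComplexD beta,
                           ComplexD *C, std::int64_t ldc, std::int64_t sdc,
                           std::int64_t batch) {
  oneapi::mkl::blas::column_major::gemm_batch(
      q, oneapi::mkl::transpose::N, oneapi::mkl::transpose::N,
      m, n, k, alpha, A, lda, sda, B, ldb, sdb, beta, C, ldc, sdc, batch);
  q.wait();  // gemm_batch is asynchronous; synchronise before reading C
}
```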
@@ -672,21 +700,23 @@ public:
     ComplexD alpha(1.0);
     ComplexD beta (1.0);
     RealD flops = 8.0*M*N*K*BATCH;
-    for(int i=0;i<10;i++){
+    int ncall=10;
     RealD t0 = usecond();
+    for(int i=0;i<ncall;i++){
       gemmStridedBatched(M,N,K,
                          alpha,
                          &A[0], // m x k
                          &B[0], // k x n
                          beta,
                          &C[0], // m x n
                          BATCH);
-      synchronise();
-      RealD t1 = usecond();
-      RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
-      flops = flops/(t1-t0)/1.e3;
     }
-    return flops;
+    synchronise();
+    RealD t1 = usecond();
+    RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
+    flops = 8.0*M*N*K*BATCH*ncall;
+    flops = flops/(t1-t0)/1.e3;
+    return flops; // Returns gigaflops
   }
 
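The reworked benchmark launches all ncall batched GEMMs before a single synchronise, so per-launch latency amortises, and rescales the flop count by ncall to match. A sketch of just the arithmetic; usecond(), Grid's microsecond wall clock, is the only assumed call:

```cpp
// Complex GEMM costs 8 real flops per multiply-add (6 for the complex multiply,
// 2 for the accumulate). With timestamps in microseconds, flop/us is 1e6 flop/s,
// so dividing by 1e3 yields Gflop/s directly.
double gflops_from_timing(double M, double N, double K, double BATCH,
                          int ncall, double t0_us, double t1_us) {
  double flops = 8.0 * M * N * K * BATCH * ncall;
  return flops / (t1_us - t0_us) / 1.e3;
}
```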
@@ -348,6 +348,7 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
   return offbytes;
 }
 
+#undef NVLINK_GET // Define to use get instead of put DMA
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int dest,int dox,
@@ -380,9 +381,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(rrq);
       off_node_bytes+=rbytes;
     }
+#ifdef NVLINK_GET
+    void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+    assert(shm!=NULL);
+    acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+#endif
   }
 
   if (dox) {
+    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -390,9 +397,12 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(xrq);
       off_node_bytes+=xbytes;
     } else {
+#ifndef NVLINK_GET
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
       assert(shm!=NULL);
       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
+
     }
   }
 
@@ -402,6 +412,8 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
 {
   int nreq=list.size();
 
+  acceleratorCopySynchronise();
+
   if (nreq==0) return;
 
   std::vector<MPI_Status> status(nreq);
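The NVLINK_GET switch inverts the intranode transfer direction: with it defined the receiver pulls from the sender's buffer (get), otherwise the sender pushes into the receiver's buffer (put). A condensed sketch of the two paths using the same Grid calls as the hunks above; the surrounding control flow is abbreviated:

```cpp
// ShmBufferTranslate maps a neighbour rank's symmetric-heap pointer into our
// own address space; the copies are asynchronous device-to-device DMA.
#ifdef NVLINK_GET
  // GET: receiver-side pull from the peer's staged xmit buffer.
  void *peer_send = ShmBufferTranslate(from, xmit);
  acceleratorCopyDeviceToDeviceAsynch(peer_send, recv, rbytes);
#else
  // PUT: sender-side push into the peer's recv buffer.
  void *peer_recv = ShmBufferTranslate(dest, recv);
  acceleratorCopyDeviceToDeviceAsynch(xmit, peer_recv, xbytes);
#endif
// Either way, acceleratorCopySynchronise() in StencilSendToRecvFromComplete()
// fences the asynchronous copies before the buffers are consumed.
```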
@@ -40,6 +40,9 @@ int GlobalSharedMemory::_ShmAlloc;
 uint64_t GlobalSharedMemory::_ShmAllocBytes;
 
 std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
+#ifndef ACCELERATOR_AWARE_MPI
+void * GlobalSharedMemory::HostCommBuf;
+#endif
 
 Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
 int GlobalSharedMemory::WorldShmRank;
@@ -66,6 +69,26 @@ void GlobalSharedMemory::SharedMemoryFree(void)
 /////////////////////////////////
 // Alloc, free shmem region
 /////////////////////////////////
+#ifndef ACCELERATOR_AWARE_MPI
+void *SharedMemory::HostBufferMalloc(size_t bytes){
+  void *ptr = (void *)host_heap_top;
+  host_heap_top  += bytes;
+  host_heap_bytes+= bytes;
+  if (host_heap_bytes >= host_heap_size) {
+    std::cout<< " HostBufferMalloc exceeded heap size -- try increasing with --shm <MB> flag" <<std::endl;
+    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
+    std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
+    std::cout<< " Current heap  is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
+    assert(host_heap_bytes<host_heap_size);
+  }
+  return ptr;
+}
+void SharedMemory::HostBufferFreeAll(void) {
+  host_heap_top  =(size_t)HostCommBuf;
+  host_heap_bytes=0;
+}
+#endif
 void *SharedMemory::ShmBufferMalloc(size_t bytes){
   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
   void *ptr = (void *)heap_top;
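HostBufferMalloc and HostBufferFreeAll form a bump ("slab") allocator over the single malloc'd arena: an allocation is a pointer increment and freeing everything is one reset, which is what makes per-exchange host staging cheap. A minimal standalone model of the pattern (not Grid's class):

```cpp
#include <cassert>
#include <cstddef>

// Minimal model of the slab allocator above; attach base/size before use.
struct SlabAllocator {
  char  *base = nullptr;  // preallocated arena      (cf. HostCommBuf)
  size_t size = 0;        // arena capacity          (cf. host_heap_size)
  size_t top  = 0;        // bytes handed out so far (cf. host_heap_bytes)

  void *alloc(size_t bytes) {
    assert(top + bytes <= size);  // mirrors the host_heap_bytes < host_heap_size check
    void *p = base + top;
    top += bytes;                 // bump; individual frees are not supported
    return p;
  }
  void free_all() { top = 0; }    // zero-cost wipe, as in HostBufferFreeAll()
};
```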
@@ -75,7 +75,9 @@ public:
   static int Hugepages;
 
   static std::vector<void *> WorldShmCommBufs;
+#ifndef ACCELERATOR_AWARE_MPI
+  static void *HostCommBuf;
+#endif
   static Grid_MPI_Comm WorldComm;
   static int WorldRank;
   static int WorldSize;
@@ -120,6 +122,13 @@ private:
   size_t heap_bytes;
   size_t heap_size;
 
+#ifndef ACCELERATOR_AWARE_MPI
+  size_t host_heap_top;  // set in free all
+  size_t host_heap_bytes;// set in free all
+  void  *HostCommBuf;    // set in SetCommunicator
+  size_t host_heap_size; // set in SetCommunicator
+#endif
+
 protected:
 
   Grid_MPI_Comm ShmComm; // for barriers
@@ -151,7 +160,10 @@ public:
   void *ShmBufferTranslate(int rank,void * local_p);
   void *ShmBufferMalloc(size_t bytes);
   void ShmBufferFreeAll(void) ;
+#ifndef ACCELERATOR_AWARE_MPI
+  void *HostBufferMalloc(size_t bytes);
+  void HostBufferFreeAll(void);
+#endif
   //////////////////////////////////////////////////////////////////////////
   // Make info on Nodes & ranks and Shared memory available
   //////////////////////////////////////////////////////////////////////////
@@ -39,10 +39,12 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #include <hip/hip_runtime_api.h>
 #endif
 #ifdef GRID_SYCL
+#ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
-#include <syscall.h>
 #define SHM_SOCKETS
 #endif
+#include <syscall.h>
+#endif
 
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -512,46 +514,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
 #if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
 
-//if defined(GRID_SYCL)
-#if 0
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  void * ShmCommBuf ;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the pointer array for shared windows for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Each MPI rank should allocate our own buffer
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
-
-  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
-    exit(EXIT_FAILURE);
-  }
-
-  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
-            << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
-
-  SharedMemoryZero(ShmCommBuf,bytes);
-
-  assert(WorldShmSize == 1);
-  for(int r=0;r<WorldShmSize;r++){
-    WorldShmCommBufs[r] = ShmCommBuf;
-  }
-  _ShmAllocBytes=bytes;
-  _ShmAlloc=1;
-}
-#endif
-
-#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
@@ -574,6 +536,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+#ifndef ACCELERATOR_AWARE_MPI
+  HostCommBuf= malloc(bytes);
+#endif
   ShmCommBuf = acceleratorAllocDevice(bytes);
   if (ShmCommBuf == (void *)NULL ) {
     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
@@ -738,7 +703,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   _ShmAllocBytes=bytes;
   _ShmAlloc=1;
 }
-#endif
 
 #else
 #ifdef GRID_MPI3_SHMMMAP
@@ -962,6 +926,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   }
   ShmBufferFreeAll();
 
+#ifndef ACCELERATOR_AWARE_MPI
+  host_heap_size = heap_size;
+  HostCommBuf= GlobalSharedMemory::HostCommBuf;
+  HostBufferFreeAll();
+#endif
+
   /////////////////////////////////////////////////////////////////////
   // find comm ranks in our SHM group (i.e. which ranks are on our node)
   /////////////////////////////////////////////////////////////////////
@@ -35,6 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_transpose.h>
 #include <Grid/lattice/Lattice_local.h>
 #include <Grid/lattice/Lattice_reduction.h>
+#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/Lattice_peekpoke.h>
 #include <Grid/lattice/Lattice_reality.h>
 #include <Grid/lattice/Lattice_real_imag.h>
@@ -46,5 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/lattice/Lattice_unary.h>
 #include <Grid/lattice/Lattice_transfer.h>
 #include <Grid/lattice/Lattice_basis.h>
-#include <Grid/lattice/Lattice_crc.h>
 #include <Grid/lattice/PaddedCell.h>
@@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
   }
 }
 
-template<class vobj> uint32_t crc(Lattice<vobj> & buf)
+template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
 {
   autoView( buf_v , buf, CpuRead);
   return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
 }
 
-#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
+#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
 
 NAMESPACE_END(Grid);
 
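With crc() now taking a const reference, the CRC fingerprint macro can be applied to more call sites, and routing it to std::cerr keeps fingerprints visible when stdout is redirected. A hypothetical usage sketch (U and grid are placeholders, not from the diff):

```cpp
// Hypothetical: inside any Grid function with a lattice field in scope.
LatticeGaugeField U(grid);   // placeholder field on an existing GridBase* grid
CRC(U);  // prints "FingerPrint <file> <line> U <crc32-of-view>" to stderr
```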
@@ -281,11 +281,29 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
   return nrm;
 }
 
 
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
+
+#ifdef GRID_SYCL
+  uint64_t csum=0;
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
+    // Hack
+    // Fast integer xor checksum. Can also be used in comms now.
+    autoView(l_v,left,AcceleratorRead);
+    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
+    uint64_t *base= (uint64_t *)&l_v[0];
+    csum=svm_xor(base,words);
+  }
+  FlightRecorder::CsumLog(csum);
+#endif
   ComplexD nrm = rankInnerProduct(left,right);
+  RealD local = real(nrm);
+  FlightRecorder::NormLog(real(nrm));
   grid->GlobalSum(nrm);
+  FlightRecorder::ReductionLog(local,real(nrm));
   return nrm;
 }
 
@@ -69,29 +69,30 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
   return result;
 }
 
-NAMESPACE_END(Grid);
-
-/*
-template<class Double> Double svm_reduce(Double *vec,uint64_t L)
+template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
-  Double sumResult; zeroit(sumResult);
-  Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
-  Double identity; zeroit(identity);
+  Word xorResult; xorResult = 0;
+  Word *d_sum =(Word *)cl::sycl::malloc_shared(sizeof(Word),*theGridAccelerator);
+  Word identity;  identity=0;
   theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
-    auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
+    auto Reduction = cl::sycl::reduction(d_sum,identity,std::bit_xor<>());
     cgh.parallel_for(cl::sycl::range<1>{L},
                      Reduction,
                      [=] (cl::sycl::id<1> index, auto &sum) {
-                       sum +=vec[index];
+                       sum ^=vec[index];
                      });
   });
   theGridAccelerator->wait();
-  Double ret = d_sum[0];
+  Word ret = d_sum[0];
   free(d_sum,*theGridAccelerator);
-  std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
   return ret;
 }
 
+NAMESPACE_END(Grid);
+
+/*
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
 {
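svm_xor is the previously commented-out plus-reduction template with the monoid swapped to xor. A standalone sketch against standard SYCL 2020 names (sycl:: rather than the cl::sycl:: spelling, and a caller-supplied queue instead of Grid's theGridAccelerator):

```cpp
#include <sycl/sycl.hpp>
#include <cstdint>

// Order-independent xor checksum of L 64-bit words via a SYCL reduction.
uint64_t xor_checksum(sycl::queue &q, const uint64_t *vec, uint64_t L) {
  uint64_t *d_sum = sycl::malloc_shared<uint64_t>(1, q);
  *d_sum = 0;                                 // combined into the reduction result
  q.submit([&](sycl::handler &cgh) {
    auto red = sycl::reduction(d_sum, uint64_t(0), sycl::bit_xor<uint64_t>());
    cgh.parallel_for(sycl::range<1>{L}, red,
                     [=](sycl::id<1> i, auto &sum) { sum ^= vec[i]; });
  });
  q.wait();
  uint64_t ret = *d_sum;
  sycl::free(d_sum, q);
  return ret;
}
```

Because xor is associative and commutative, the checksum is independent of reduction order, which is what makes it usable for run-to-run comparisons in innerProduct and, as the comment in the diff notes, in comms.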
@@ -411,7 +411,7 @@ public:
     std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
     SeedFixedIntegers(seeds);
   }
-  void SeedFixedIntegers(const std::vector<int> &seeds){
+  void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
 
     // Everyone generates the same seed_seq based on input seeds
     CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@@ -428,7 +428,6 @@ public:
     // MT implementation does not implement fast discard even though
     // in principle this is possible
     ////////////////////////////////////////////////
-#if 1
     thread_for( lidx, _grid->lSites(), {
 
       int gidx;
@@ -449,29 +448,12 @@ public:
 
       int l_idx=generator_idx(o_idx,i_idx);
       _generators[l_idx] = master_engine;
-      Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
-    });
-#else
-    // Everybody loops over global volume.
-    thread_for( gidx, _grid->_gsites, {
-
-      // Where is it?
-      int rank;
-      int o_idx;
-      int i_idx;
-
-      Coordinate gcoor;
-      _grid->GlobalIndexToGlobalCoor(gidx,gcoor);
-      _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
-
-      // If this is one of mine we take it
-      if( rank == _grid->ThisRank() ){
-        int l_idx=generator_idx(o_idx,i_idx);
-        _generators[l_idx] = master_engine;
+      if ( britney ) {
+        Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
+      } else {
         Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
       }
     });
-#endif
 #else
     ////////////////////////////////////////////////////////////////
     // Machine and thread decomposition dependent seeding is efficient
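Both seeding branches copy a single master engine to every local site and then skip it forward so the per-site streams are disjoint; the new britney flag only changes the skip index from the global site index gidx to the local one l_idx, which presumably is cheaper but ties the streams to the MPI decomposition. A hedged model with std::mt19937_64, where stream_stride is an assumed per-stream gap and all names are local to the sketch:

```cpp
#include <random>
#include <vector>
#include <cstdint>

// Model of the per-site seeding scheme above; Grid's Skip() jumps the engine by
// a fixed stride per sequence index, modelled here with discard().
void seed_sites(std::vector<std::mt19937_64> &gens, std::mt19937_64 master,
                const std::vector<uint64_t> &site_index,  // l_idx or gidx per site
                uint64_t stream_stride) {                 // assumed gap between streams
  for (size_t l = 0; l < gens.size(); l++) {
    gens[l] = master;                                 // identical state everywhere
    gens[l].discard(site_index[l] * stream_stride);   // jump to this site's stream
  }
}
```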
@@ -280,20 +280,16 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
 
   if( interior && exterior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1);    return;}
+#ifndef GRID_CUDA
     if (Opt == OptInlineAsm  ) { ASM_CALL(DhopSiteAsm);          return;}
 #endif
   } else if( interior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1);    return;}
-#endif
   } else if( exterior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1);    return;}
-#endif
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
@@ -322,19 +318,13 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
 
   if( interior && exterior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
-#endif
   } else if( interior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
-#endif
   } else if( exterior ) {
     if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
-#ifndef GRID_CUDA
     if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
-#endif
   }
 }
 
@@ -462,6 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
     autoView(st_v , st,AcceleratorRead);
 
     if( interior && exterior ) {
+      acceleratorFenceComputeStream();
       if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSite); return;}
       if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@@ -495,6 +496,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
     autoView(st_v ,st,AcceleratorRead);
 
     if( interior && exterior ) {
+      acceleratorFenceComputeStream();
      if (Opt == WilsonKernelsStatic::OptGeneric    ) { KERNEL_CALL(GenericDhopSiteDag); return;}
       if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag);    return;}
 #ifndef GRID_CUDA
@@ -1133,4 +1133,13 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
 
 NAMESPACE_END(Grid);
 
+#ifdef GRID_SYCL
+template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealF   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vRealD   > : public std::true_type {};
+template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
+#endif
+
+
 #endif
@@ -70,57 +70,6 @@ struct DefaultImplParams {
 void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                  int off,std::vector<std::pair<int,int> > & table);
 
-/*
-template<class vobj,class cobj,class compressor>
-void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so) __attribute__((noinline));
-
-template<class vobj,class cobj,class compressor>
-void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
-{
-  int num=table.size();
-  std::pair<int,int> *table_v = & table[0];
-
-  auto rhs_v = rhs.View(AcceleratorRead);
-  accelerator_forNB( i,num, vobj::Nsimd(), {
-    compress.Compress(buffer[off+table_v[i].first],rhs_v[so+table_v[i].second]);
-  });
-  rhs_v.ViewClose();
-}
-
-///////////////////////////////////////////////////////////////////
-// Gather for when there *is* need to SIMD split with compression
-///////////////////////////////////////////////////////////////////
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
-                                 commVector<cobj *> pointers,
-                                 int dimension,int plane,
-                                 int cbmask,compressor &compress,int type) __attribute__((noinline));
-
-template<class cobj,class vobj,class compressor>
-void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
-                                 const Lattice<vobj> &rhs,
-                                 std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
-                                 compressor &compress,int type)
-{
-  assert( (table.size()&0x1)==0);
-  int num=table.size()/2;
-  int so  = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
-
-  auto rhs_v = rhs.View(AcceleratorRead);
-  auto rhs_p = &rhs_v[0];
-  auto p0=&pointers[0][0];
-  auto p1=&pointers[1][0];
-  auto tp=&table[0];
-  accelerator_forNB(j, num, vobj::Nsimd(), {
-    compress.CompressExchange(p0,p1, rhs_p, j,
-                              so+tp[2*j  ].second,
-                              so+tp[2*j+1].second,
-                              type);
-  });
-  rhs_v.ViewClose();
-}
-*/
-
 void DslashResetCounts(void);
 void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
 void DslashLogFull(void);
@@ -258,6 +207,10 @@ public:
   struct Packet {
     void * send_buf;
     void * recv_buf;
+#ifndef ACCELERATOR_AWARE_MPI
+    void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
+    void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
+#endif
     Integer to_rank;
     Integer from_rank;
     Integer do_send;
@@ -324,7 +277,7 @@ public:
   Vector<int> surface_list;
 
   stencilVector<StencilEntry>  _entries;        // Resident in managed memory
-  commVector<StencilEntry>     _entries_device; // Resident in managed memory
+  commVector<StencilEntry>     _entries_device; // Resident in device memory
   std::vector<Packet> Packets;
   std::vector<Merge> Mergers;
   std::vector<Merge> MergersSHM;
@@ -408,33 +361,16 @@ public:
   // Use OpenMP Tasks for cleaner ???
   // must be called *inside* parallel region
   //////////////////////////////////////////
-  /*
-  void CommunicateThreaded()
-  {
-#ifdef GRID_OMP
-    int mythread = omp_get_thread_num();
-    int nthreads = CartesianCommunicator::nCommThreads;
-#else
-    int mythread = 0;
-    int nthreads = 1;
-#endif
-    if (nthreads == -1) nthreads = 1;
-    if (mythread < nthreads) {
-      for (int i = mythread; i < Packets.size(); i += nthreads) {
-        uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
-                                                      Packets[i].to_rank,
-                                                      Packets[i].recv_buf,
-                                                      Packets[i].from_rank,
-                                                      Packets[i].bytes,i);
-      }
-    }
-  }
-  */
   ////////////////////////////////////////////////////////////////////////
   // Non blocking send and receive. Necessarily parallel.
   ////////////////////////////////////////////////////////////////////////
   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
   {
+    // All GPU kernel tasks must complete
+    // accelerator_barrier();   // All kernels should ALREADY be complete
+    // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
+    // But the HaloGather had a barrier too.
+#ifdef ACCELERATOR_AWARE_MPI
     for(int i=0;i<Packets.size();i++){
       _grid->StencilSendToRecvFromBegin(MpiReqs,
                                         Packets[i].send_buf,
@ -443,16 +379,54 @@ public:
|
|||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
|
||||||
|
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
|
||||||
|
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
|
||||||
|
if ( Packets[i].do_send ) {
|
||||||
|
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
|
||||||
|
}
|
||||||
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
|
Packets[i].host_send_buf,
|
||||||
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
|
Packets[i].host_recv_buf,
|
||||||
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
|
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// Get comms started then run checksums
|
||||||
|
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
if ( Packets[i].do_send )
|
||||||
|
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
|
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||||
if ( this->partialDirichlet ) DslashLogPartial();
|
if ( this->partialDirichlet ) DslashLogPartial();
|
||||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||||
else DslashLogFull();
|
else DslashLogFull();
|
||||||
acceleratorCopySynchronise();
|
// acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
|
||||||
|
// accelerator_barrier();
|
||||||
_grid->StencilBarrier();
|
_grid->StencilBarrier();
|
||||||
|
#ifndef ACCELERATOR_AWARE_MPI
|
||||||
|
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
if ( Packets[i].do_recv ) {
|
||||||
|
acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_grid->HostBufferFreeAll();
|
||||||
|
#endif
|
||||||
|
// run any checksums
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
if ( Packets[i].do_recv )
|
||||||
|
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Blocking send and receive. Either sequential or parallel.
|
// Blocking send and receive. Either sequential or parallel.
|
||||||
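
The hunk above adds the fallback path for MPI stacks that are not accelerator aware: device halo buffers are staged through host slabs before the sends post, and receives are copied back to the device after completion. A minimal standalone sketch of the same staging pattern follows; copy_to_host/copy_to_device and StagedPacket are hypothetical stand-ins for Grid's acceleratorCopyFromDevice/acceleratorCopyToDevice and Packet, not the actual API.

#include <mpi.h>
#include <cstring>
#include <vector>

// Hypothetical stand-ins for the accelerator runtime's D2H/H2D copies
// (in Grid: acceleratorCopyFromDevice / acceleratorCopyToDevice).
static void copy_to_host(void *dst, const void *src, size_t n)   { std::memcpy(dst, src, n); }
static void copy_to_device(void *dst, const void *src, size_t n) { std::memcpy(dst, src, n); }

struct StagedPacket {
  void *dev_send, *dev_recv;               // device-resident halo buffers
  std::vector<char> host_send, host_recv;  // cheap host slabs
  int to_rank, from_rank;
  size_t bytes;
};

void begin_comms(StagedPacket &p, MPI_Request req[2]) {
  p.host_send.resize(p.bytes);
  p.host_recv.resize(p.bytes);
  // D2H copy must complete before MPI touches the host slab
  copy_to_host(p.host_send.data(), p.dev_send, p.bytes);
  MPI_Irecv(p.host_recv.data(), (int)p.bytes, MPI_CHAR, p.from_rank, 0,
            MPI_COMM_WORLD, &req[0]);
  MPI_Isend(p.host_send.data(), (int)p.bytes, MPI_CHAR, p.to_rank, 0,
            MPI_COMM_WORLD, &req[1]);
}

void complete_comms(StagedPacket &p, MPI_Request req[2]) {
  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
  // H2D copy only after MPI has filled the host receive slab
  copy_to_device(p.dev_recv, p.host_recv.data(), p.bytes);
}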
@@ -528,6 +502,7 @@ public:
   template<class compressor>
   void HaloGather(const Lattice<vobj> &source,compressor &compress)
   {
+    // accelerator_barrier();
    _grid->StencilBarrier();// Synch shared memory on a single nodes

    assert(source.Grid()==_grid);
@@ -540,10 +515,9 @@ public:
      compress.Point(point);
      HaloGatherDir(source,compress,point,face_idx);
    }
-    accelerator_barrier();
+    accelerator_barrier(); // All my local gathers are complete
    face_table_computed=1;
    assert(u_comm_offset==_unified_buffer_size);
-
  }

  /////////////////////////
@@ -579,6 +553,7 @@ public:
          accelerator_forNB(j, words, cobj::Nsimd(), {
              coalescedWrite(to[j] ,coalescedRead(from [j]));
          });
+         acceleratorFenceComputeStream();
        }
      }

@@ -669,6 +644,7 @@ public:
    for(int i=0;i<dd.size();i++){
      decompressor::DecompressFace(decompress,dd[i]);
    }
+    acceleratorFenceComputeStream(); // dependent kernels
  }
  ////////////////////////////////////////
  // Set up routines
@@ -1224,7 +1200,6 @@ public:
         ///////////////////////////////////////////////////////////
         int do_send = (comms_send|comms_partial_send) && (!shm_send );
         int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
-
         AddPacket((void *)&send_buf[comm_off],
                   (void *)&recv_buf[comm_off],
                   xmit_to_rank, do_send,
@@ -404,3 +404,12 @@ NAMESPACE_BEGIN(Grid);
 };
 NAMESPACE_END(Grid);
 
+
+#ifdef GRID_SYCL
+template<typename T> struct
+sycl::is_device_copyable<T, typename std::enable_if<
+                              Grid::isGridTensor<T>::value && (!std::is_trivially_copyable<T>::value),
+                              void>::type>
+  : public std::true_type {};
+#endif
+
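
The trait specialization just added tells SYCL that Grid tensor types, although not trivially copyable in the C++ sense, are safe to copy byte-wise into a kernel. A minimal standalone analogue, with a toy MyTensor standing in for Grid's tensor types, is sketched below.

#include <sycl/sycl.hpp>
#include <type_traits>

// A type with a user-defined copy constructor: not trivially copyable,
// but still byte-wise device-copyable, so we opt it in explicitly.
struct MyTensor {
  double data[4];
  MyTensor() = default;
  MyTensor(const MyTensor &rhs) { for (int i=0;i<4;i++) data[i]=rhs.data[i]; }
};

// Without this specialization, capturing MyTensor by value in a SYCL
// kernel lambda is ill-formed: kernel captures must be device copyable.
template <>
struct sycl::is_device_copyable<MyTensor> : std::true_type {};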
@@ -255,17 +255,13 @@ inline int acceleratorIsCommunicable(void *ptr)
 #define GRID_SYCL_LEVEL_ZERO_IPC
 
 NAMESPACE_END(Grid);
-#if 0
-#include <CL/sycl.hpp>
-#include <CL/sycl/usm.hpp>
-#include <level_zero/ze_api.h>
-#include <CL/sycl/backend/level_zero.hpp>
-#else
+// Force deterministic reductions
+#define SYCL_REDUCTION_DETERMINISTIC
 #include <sycl/CL/sycl.hpp>
 #include <sycl/usm.hpp>
 #include <level_zero/ze_api.h>
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
-#endif
 
 NAMESPACE_BEGIN(Grid);
 
Grid/util/FlightRecorder.cc (new file, 339 lines)
@@ -0,0 +1,339 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/Init.cc
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Peter Boyle <peterboyle@MacBook-Pro.local>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/* END LEGAL */
+#include <Grid/Grid.h>
+
+NAMESPACE_BEGIN(Grid);
+///////////////////////////////////////////////////////
+// Grid Norm logging for repro testing
+///////////////////////////////////////////////////////
+int FlightRecorder::PrintEntireLog;
+int FlightRecorder::ContinueOnFail;
+int FlightRecorder::LoggingMode;
+int FlightRecorder::ChecksumComms;
+int FlightRecorder::ChecksumCommsSend;
+int32_t FlightRecorder::XmitLoggingCounter;
+int32_t FlightRecorder::RecvLoggingCounter;
+int32_t FlightRecorder::CsumLoggingCounter;
+int32_t FlightRecorder::NormLoggingCounter;
+int32_t FlightRecorder::ReductionLoggingCounter;
+uint64_t FlightRecorder::ErrorCounter;
+std::vector<double> FlightRecorder::NormLogVector;
+std::vector<double> FlightRecorder::ReductionLogVector;
+std::vector<uint64_t> FlightRecorder::CsumLogVector;
+std::vector<uint64_t> FlightRecorder::XmitLogVector;
+std::vector<uint64_t> FlightRecorder::RecvLogVector;
+
+void FlightRecorder::ResetCounters(void)
+{
+  XmitLoggingCounter=0;
+  RecvLoggingCounter=0;
+  CsumLoggingCounter=0;
+  NormLoggingCounter=0;
+  ReductionLoggingCounter=0;
+}
+void FlightRecorder::Truncate(void)
+{
+  ResetCounters();
+  XmitLogVector.resize(0);
+  RecvLogVector.resize(0);
+  NormLogVector.resize(0);
+  CsumLogVector.resize(0);
+  ReductionLogVector.resize(0);
+}
+void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
+{
+  switch ( mode ) {
+  case LoggingModePrint:
+    SetLoggingModePrint();
+    break;
+  case LoggingModeRecord:
+    SetLoggingModeRecord();
+    break;
+  case LoggingModeVerify:
+    SetLoggingModeVerify();
+    break;
+  case LoggingModeNone:
+    LoggingMode = mode;
+    Truncate();
+    break;
+  default:
+    assert(0);
+  }
+}
+
+void FlightRecorder::SetLoggingModePrint(void)
+{
+  std::cout << " FlightRecorder: set to print output " <<std::endl;
+  Truncate();
+  LoggingMode = LoggingModePrint;
+}
+void FlightRecorder::SetLoggingModeRecord(void)
+{
+  std::cout << " FlightRecorder: set to RECORD " <<std::endl;
+  Truncate();
+  LoggingMode = LoggingModeRecord;
+}
+void FlightRecorder::SetLoggingModeVerify(void)
+{
+  std::cout << " FlightRecorder: set to VERIFY " << NormLogVector.size()<< " log entries "<<std::endl;
+  ResetCounters();
+  LoggingMode = LoggingModeVerify;
+}
+uint64_t FlightRecorder::ErrorCount(void)
+{
+  return ErrorCounter;
+}
+void FlightRecorder::NormLog(double value)
+{
+  uint64_t hex = * ( (uint64_t *)&value );
+  if(LoggingMode == LoggingModePrint) {
+    std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
+    NormLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeRecord) {
+    std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
+    NormLogVector.push_back(value);
+    NormLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeVerify) {
+
+    if(NormLoggingCounter < NormLogVector.size()){
+      uint64_t hexref = * ( (uint64_t *)&NormLogVector[NormLoggingCounter] );
+
+      if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
+
+        std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
+                 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
+                 <<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
+
+        std::cerr << " Oops got norm "<< std::hexfloat<<value<<" expect "<<NormLogVector[NormLoggingCounter] <<std::endl;
+
+        fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for norm %d/%zu %.16e expect %.16e\n",
+                GridHostname(),
+                GlobalSharedMemory::WorldShmRank,
+                NormLoggingCounter,NormLogVector.size(),
+                value, NormLogVector[NormLoggingCounter]); fflush(stderr);
+
+        if(!ContinueOnFail)assert(0); // Force takedown of job
+
+        ErrorCounter++;
+      } else {
+        if ( PrintEntireLog ) {
+          std::cerr<<"FlightRecorder::NormLog VALID "<< NormLoggingCounter << std::hex
+                   <<" "<<hex<<" "<<hexref
+                   <<" "<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::dec<<std::endl;
+        }
+      }
+
+    }
+    if ( NormLogVector.size()==NormLoggingCounter ) {
+      std::cout << "FlightRecorder:: Verified entire sequence of "<<NormLoggingCounter<<" norms "<<std::endl;
+    }
+    NormLoggingCounter++;
+  }
+}
+void FlightRecorder::CsumLog(uint64_t hex)
+{
+  if(LoggingMode == LoggingModePrint) {
+    std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
+    CsumLoggingCounter++;
+  }
+
+  if(LoggingMode == LoggingModeRecord) {
+    std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
+    CsumLogVector.push_back(hex);
+    CsumLoggingCounter++;
+  }
+
+  if(LoggingMode == LoggingModeVerify) {
+
+    if(CsumLoggingCounter < CsumLogVector.size()) {
+
+      uint64_t hexref = CsumLogVector[CsumLoggingCounter] ;
+
+      if ( hex != hexref ) {
+
+        std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
+                 <<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
+
+        fprintf(stderr,"%s:%d Oops, I did it again! Reproduce failure for csum %d %lx expect %lx\n",
+                GridHostname(),
+                GlobalSharedMemory::WorldShmRank,
+                CsumLoggingCounter,hex, hexref);
+        fflush(stderr);
+
+        if(!ContinueOnFail) assert(0); // Force takedown of job
+
+        ErrorCounter++;
+
+      } else {
+
+        if ( PrintEntireLog ) {
+          std::cerr<<"FlightRecorder::CsumLog VALID "<< CsumLoggingCounter << std::hex
+                   <<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
+        }
+      }
+    }
+    if ( CsumLogVector.size()==CsumLoggingCounter ) {
+      std::cout << "FlightRecorder:: Verified entire sequence of "<<CsumLoggingCounter<<" checksums "<<std::endl;
+    }
+    CsumLoggingCounter++;
+  }
+}
+void FlightRecorder::ReductionLog(double local,double global)
+{
+  uint64_t hex_l = * ( (uint64_t *)&local );
+  uint64_t hex_g = * ( (uint64_t *)&global );
+  if(LoggingMode == LoggingModePrint) {
+    std::cerr<<"FlightRecorder::ReductionLog : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
+    ReductionLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeRecord) {
+    std::cerr<<"FlightRecorder::ReductionLog RECORDING : "<< ReductionLoggingCounter <<" "<< std::hex << hex_l << " -> " <<hex_g<<std::dec <<std::endl;
+    ReductionLogVector.push_back(global);
+    ReductionLoggingCounter++;
+  }
+  if(LoggingMode == LoggingModeVerify) {
+    if(ReductionLoggingCounter < ReductionLogVector.size()){
+      if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
+        fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
+                GridHostname(),
+                GlobalSharedMemory::WorldShmRank,
+                ReductionLoggingCounter,ReductionLogVector.size(),
+                global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
+
+        if ( !ContinueOnFail ) assert(0);
+
+        ErrorCounter++;
+      } else {
+        if ( PrintEntireLog ) {
+          std::cerr<<"FlightRecorder::ReductionLog : VALID "<< ReductionLoggingCounter <<" "<< std::hexfloat << local << "-> "<< global <<std::endl;
+        }
+      }
+    }
+    if ( ReductionLogVector.size()==ReductionLoggingCounter ) {
+      std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<ReductionLoggingCounter<<" norms "<<std::endl;
+    }
+    ReductionLoggingCounter++;
+  }
+}
+void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
+{
+  if ( ChecksumCommsSend ){
+    uint64_t *ubuf = (uint64_t *)buf;
+    if(LoggingMode == LoggingModeNone) return;
+#ifdef GRID_SYCL
+    uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
+    if(LoggingMode == LoggingModePrint) {
+      std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
+      XmitLoggingCounter++;
+    }
+    if(LoggingMode == LoggingModeRecord) {
+      std::cerr<<"FlightRecorder::xmitLog RECORD : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
+      XmitLogVector.push_back(_xor);
+      XmitLoggingCounter++;
+    }
+    if(LoggingMode == LoggingModeVerify) {
+      if(XmitLoggingCounter < XmitLogVector.size()){
+        if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
+          fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
+                  GridHostname(),
+                  GlobalSharedMemory::WorldShmRank,
+                  XmitLoggingCounter,XmitLogVector.size(),
+                  _xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
+
+          if ( !ContinueOnFail ) assert(0);
+
+          ErrorCounter++;
+        } else {
+          if ( PrintEntireLog ) {
+            std::cerr<<"FlightRecorder::XmitLog : VALID "<< XmitLoggingCounter <<" "<< std::hexfloat << _xor << " "<< XmitLogVector[XmitLoggingCounter] <<std::endl;
+          }
+        }
+      }
+      if ( XmitLogVector.size()==XmitLoggingCounter ) {
+        std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<XmitLoggingCounter<<" sends "<<std::endl;
+      }
+      XmitLoggingCounter++;
+    }
+#endif
+  } else {
+    uint64_t word = 1;
+    deviceVector<uint64_t> dev(1);
+    acceleratorCopyToDevice(&word,&dev[0],sizeof(uint64_t));
+    acceleratorCopySynchronise();
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+}
+void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
+{
+  if ( ChecksumComms ){
+    uint64_t *ubuf = (uint64_t *)buf;
+    if(LoggingMode == LoggingModeNone) return;
+#ifdef GRID_SYCL
+    uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
+    if(LoggingMode == LoggingModePrint) {
+      std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
+      RecvLoggingCounter++;
+    }
+    if(LoggingMode == LoggingModeRecord) {
+      std::cerr<<"FlightRecorder::recvLog RECORD : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
+      RecvLogVector.push_back(_xor);
+      RecvLoggingCounter++;
+    }
+    if(LoggingMode == LoggingModeVerify) {
+      if(RecvLoggingCounter < RecvLogVector.size()){
+        if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
+          fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
+                  GridHostname(),
+                  GlobalSharedMemory::WorldShmRank,
+                  RecvLoggingCounter,RecvLogVector.size(),
+                  _xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
+
+          if ( !ContinueOnFail ) assert(0);
+
+          ErrorCounter++;
+        } else {
+          if ( PrintEntireLog ) {
+            std::cerr<<"FlightRecorder::RecvLog : VALID "<< RecvLoggingCounter <<" "<< std::hexfloat << _xor << " "<< RecvLogVector[RecvLoggingCounter] <<std::endl;
+          }
+        }
+      }
+      if ( RecvLogVector.size()==RecvLoggingCounter ) {
+        std::cout << "FlightRecorder::ReductionLog : Verified entire sequence of "<<RecvLoggingCounter<<" sends "<<std::endl;
+      }
+      RecvLoggingCounter++;
+    }
+#endif
+  }
+}

NAMESPACE_END(Grid);
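
xmitLog and recvLog fold each message buffer into a single 64-bit XOR word via svm_xor (a SYCL device-side reduction in Grid). A plain host-side equivalent, purely for illustration, might look like the sketch below; xor_checksum is a hypothetical stand-in, not Grid's svm_xor.

#include <cstdint>
#include <cstddef>

// Hypothetical host-side stand-in for Grid's device-resident svm_xor:
// XOR-fold a buffer of 64-bit words into one checksum word. XOR is
// order-independent, so the result is reproducible no matter how the
// reduction is scheduled across threads or work-groups.
static uint64_t xor_checksum(const uint64_t *buf, size_t nwords) {
  uint64_t x = 0;
  for (size_t i = 0; i < nwords; i++) x ^= buf[i];
  return x;
}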
Grid/util/FlightRecorder.h (new file, 43 lines)
@@ -0,0 +1,43 @@
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+class FlightRecorder {
+ public:
+  enum LoggingMode_t {
+    LoggingModeNone,
+    LoggingModePrint,
+    LoggingModeRecord,
+    LoggingModeVerify
+  };
+
+  static int LoggingMode;
+  static uint64_t ErrorCounter;
+  static int32_t XmitLoggingCounter;
+  static int32_t RecvLoggingCounter;
+  static int32_t CsumLoggingCounter;
+  static int32_t NormLoggingCounter;
+  static int32_t ReductionLoggingCounter;
+  static std::vector<uint64_t> XmitLogVector;
+  static std::vector<uint64_t> RecvLogVector;
+  static std::vector<uint64_t> CsumLogVector;
+  static std::vector<double> NormLogVector;
+  static std::vector<double> ReductionLogVector;
+  static int ContinueOnFail;
+  static int PrintEntireLog;
+  static int ChecksumComms;
+  static int ChecksumCommsSend;
+  static void SetLoggingModePrint(void);
+  static void SetLoggingModeRecord(void);
+  static void SetLoggingModeVerify(void);
+  static void SetLoggingMode(LoggingMode_t mode);
+  static void NormLog(double value);
+  static void CsumLog(uint64_t csum);
+  static void ReductionLog(double lcl, double glbl);
+  static void Truncate(void);
+  static void ResetCounters(void);
+  static uint64_t ErrorCount(void);
+  static void xmitLog(void *,uint64_t bytes);
+  static void recvLog(void *,uint64_t bytes,int rank);
+};
+NAMESPACE_END(Grid);
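
The API above suggests a two-pass reproducibility workflow: run once in record mode to capture the bit patterns of every reduction, then rerun in verify mode and assert on any bitwise difference. The sketch below is an assumed usage pattern, not code from the repo; compute_norm is a hypothetical stand-in for any Grid reduction whose result you want to log.

#include <Grid/Grid.h>
#include <cassert>
using namespace Grid;

double compute_norm() { return 1.0; } // hypothetical stand-in reduction

void repro_test(int npass) {
  FlightRecorder::SetLoggingModeRecord();   // pass 0: record bit patterns
  for (int i = 0; i < npass; i++) {
    FlightRecorder::NormLog(compute_norm());
  }
  FlightRecorder::SetLoggingModeVerify();   // later passes: compare exactly
  for (int i = 0; i < npass; i++) {
    FlightRecorder::NormLog(compute_norm()); // asserts on any bit difference
  }
  assert(FlightRecorder::ErrorCount() == 0);
}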
@@ -77,6 +77,10 @@ feenableexcept (unsigned int excepts)
 }
 #endif
 
+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+#endif
+
 NAMESPACE_BEGIN(Grid);
 
 //////////////////////////////////////////////////////
@@ -90,7 +94,12 @@ int GridThread::_threads =1;
 int GridThread::_hyperthreads=1;
 int GridThread::_cores=1;
 
+char hostname[HOST_NAME_MAX+1];
 
+char *GridHostname(void)
+{
+  return hostname;
+}
 const Coordinate &GridDefaultLatt(void)     {return Grid_default_latt;};
 const Coordinate &GridDefaultMpi(void)      {return Grid_default_mpi;};
 const Coordinate GridDefaultSimd(int dims,int nsimd)
@@ -393,6 +402,8 @@ void Grid_init(int *argc,char ***argv)
   std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
   std::cout << GridLogMessage << "================================================ "<<std::endl;
 
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
 
   /////////////////////////////////////////////////////////
   // Reporting
@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
 void Grid_init(int *argc,char ***argv);
 void Grid_finalize(void);
 
+char * GridHostname(void);
+
 // internal, controled with --handle
 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
 void Grid_debug_handler_init(void);
@@ -68,5 +70,6 @@ void GridParseLayout(char **argv,int argc,
 void printHash(void);
 
 
+
 NAMESPACE_END(Grid);
 
@@ -1,6 +1,6 @@
-#ifndef GRID_UTIL_H
-#define GRID_UTIL_H
+#pragma once
 #include <Grid/util/Coordinate.h>
 #include <Grid/util/Lexicographic.h>
 #include <Grid/util/Init.h>
-#endif
+#include <Grid/util/FlightRecorder.h>
+
@@ -65,7 +65,7 @@ struct time_statistics{
 
 void comms_header(){
   std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
-            <<"bytes\t MB/s uni (err/min/max) \t\t MB/s bidi (err/min/max)"<<std::endl;
+            <<"bytes\t MB/s uni \t\t MB/s bidi "<<std::endl;
 };
 
 struct controls {
@@ -180,10 +180,9 @@ public:
 
       std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
               << bytes << " \t "
-              <<xbytes/timestat.mean<<" \t "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " \t "
-              <<xbytes/timestat.max <<" "<< xbytes/timestat.min
-              << "\t\t"<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
-              << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
+              <<xbytes/timestat.mean
+              << "\t\t"
+              << bidibytes/timestat.mean<< std::endl;
       fprintf(FP,"%ld, %d, %f\n",(long)bytes,dir,bidibytes/timestat.mean/1000.);
     }
   }
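
The dropped columns came from a time_statistics struct (mean/err/min/max of repeated timings) and converted a timing error into a bandwidth error by first-order propagation, exactly as the pre-change code did: for B = bytes/t, dB ~ bytes*dt/t^2. A self-contained illustration follows; the struct's field layout here is an assumption, not the exact Grid definition.

#include <cstdint>
#include <iostream>

// Assumed shape of the benchmark's timing statistics (mean/err/min/max
// of repeated measurements); not the exact Grid struct.
struct timing_stats { double mean, err, min, max; };

// Bandwidth columns as the pre-change code computed them:
// B = bytes/t, so dB ~ bytes * dt / t^2 (first-order error propagation);
// the min/max times give the max/min bandwidths respectively.
void print_bandwidth(uint64_t bytes, const timing_stats &t) {
  double bw_mean  = bytes / t.mean;
  double bw_err   = bytes * t.err / (t.mean * t.mean);
  double bw_best  = bytes / t.min;  // fastest run -> highest bandwidth
  double bw_worst = bytes / t.max;
  std::cout << bw_mean << " (" << bw_err << "/" << bw_worst << "/" << bw_best << ")\n";
}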
@@ -220,7 +219,7 @@ public:
   uint64_t NN;
 
 
-  uint64_t lmax=32;
+  uint64_t lmax=40;
 #define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 
   GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
@@ -256,7 +255,7 @@ public:
               << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
               << "\t\t"<< bytes/time/NN <<std::endl;
 
-    fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN/1000.);
+    fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN);
 
   }
   fprintf(FP,"\n\n");
@@ -268,64 +267,61 @@ public:
   //int nbasis, int nrhs, int coarseVol
   int basis[] = { 16,32,64 };
   int rhs[]   = { 8,16,32 };
-  int vols[]  = { 4*4*4*4, 8*8*8*8, 8*8*16*16 };
+  int vol = 4*4*4*4;
 
   GridBLAS blas;
 
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
   std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / node (coarse mrhs)"<<std::endl;
+  std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 
  fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
 
  for(int b=0;b<3;b++){
  for(int r=0;r<3;r++){
-  for(int v=0;v<3;v++){
    int M=basis[b];
    int N=rhs[r];
    int K=basis[b];
-    int BATCH=vols[v];
-    double p=blas.benchmark(M,rhs[r],vols[v],1);
+    int BATCH=vol;
+    double p=blas.benchmark(M,N,K,BATCH);
 
    fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
 
    std::cout<<GridLogMessage<<std::setprecision(3)
            << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-  }}}
+  }}
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / node (block project)"<<std::endl;
+  std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block project)"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  for(int b=0;b<3;b++){
  for(int r=0;r<3;r++){
-  for(int v=0;v<2;v++){
    int M=basis[b];
    int N=rhs[r];
-    int K=vols[2];
-    int BATCH=vols[v];
-    double p=blas.benchmark(M,rhs[r],vols[v],1);
+    int K=vol;
+    int BATCH=vol;
+    double p=blas.benchmark(M,N,K,BATCH);
 
    fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
    std::cout<<GridLogMessage<<std::setprecision(3)
            << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-  }}}
+  }}
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
-  std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / node (block promote)"<<std::endl;
+  std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block promote)"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  for(int b=0;b<3;b++){
  for(int r=0;r<3;r++){
-  for(int v=0;v<2;v++){
    int M=rhs[r];
-    int N=vols[2];
+    int N=vol;
    int K=basis[b];
-    int BATCH=vols[v];
-    double p=blas.benchmark(M,rhs[r],vols[v],1);
+    int BATCH=vol;
+    double p=blas.benchmark(M,N,K,BATCH);
 
    fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
    std::cout<<GridLogMessage<<std::setprecision(3)
            << M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
-  }}}
+  }}
  fprintf(FP,"\n\n\n");
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 };
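
The change above passes explicit (M,N,K,BATCH) dimensions to the batched GEMM benchmark. A common way to turn a timed batch into a GF/s figure, sketched below as an assumption (the 8*M*N*K complex flop count and gemm_gflops are not read from GridBLAS), is:

#include <cstdio>

// Hypothetical flop-rate bookkeeping for a batch of complex-double GEMMs,
// matching the (M,N,K,BATCH) call shape used above. One complex fused
// multiply-add costs 8 real flops, so one M x N x K GEMM is ~8*M*N*K
// flops; this counting convention is an assumption.
double gemm_gflops(int M, int N, int K, int BATCH, double seconds) {
  double flops = 8.0 * M * N * K * (double)BATCH;
  return flops / seconds / 1.0e9;
}

int main() {
  // e.g. a 64x32x64 GEMM batched over a 4^4 coarse volume, timed externally
  std::printf("%f GF/s\n", gemm_gflops(64, 32, 64, 256, 0.001));
  return 0;
}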
@@ -458,11 +454,17 @@ public:
     pickCheckerboard(Even,src_e,src);
     pickCheckerboard(Odd,src_o,src);
 
-    const int num_cases = 1;
+#ifdef AVX512
+    const int num_cases = 3;
+#else
+    const int num_cases = 2;
+#endif
     std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
 
     controls Cases [] = {
-      { WilsonKernelsStatic::OptGeneric   , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
+      { WilsonKernelsStatic::OptGeneric   , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+      { WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+      { WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
     };
 
     for(int c=0;c<num_cases;c++) {
@@ -473,6 +475,10 @@ public:
 
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
       if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM WilsonKernels" <<std::endl;
+      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
+      if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
+      if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
       std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
       std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 
@@ -618,11 +624,13 @@ public:
     pickCheckerboard(Even,src_e,src);
     pickCheckerboard(Odd,src_o,src);
 
-    const int num_cases = 1;
+    const int num_cases = 2;
     std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
 
     controls Cases [] = {
       { StaggeredKernelsStatic::OptGeneric   , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+      { StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
+      { StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
     };
 
     for(int c=0;c<num_cases;c++) {
@@ -851,11 +859,8 @@ int main (int argc, char ** argv)
   }
 
   CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
-#ifdef KNL
-  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
-#else
   LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-#endif
   Benchmark::Decomposition();
 
   int do_su4=0;
@@ -873,10 +878,10 @@ int main (int argc, char ** argv)
 
     int Ls=1;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    std::cout<<GridLogMessage << " Clover dslash 4D vectorised" <<std::endl;
+    std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     for(int l=0;l<L_list.size();l++){
-      clover.push_back(Benchmark::Clover(L_list[l]));
+      clover.push_back(Benchmark::DWF(1,L_list[l]));
     }
 
     Ls=12;
@@ -914,7 +919,7 @@ int main (int argc, char ** argv)
   }
 
   if ( do_blas ) {
-#if defined(GRID_CUDA) || defined(GRID_HIP)
+#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
     std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
     std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
@@ -942,7 +947,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
   fprintf(FP,"Per node summary table\n");
   fprintf(FP,"\n");
-  fprintf(FP,"L , Wilson, DWF4, Staggered\n");
+  fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
   fprintf(FP,"\n");
   for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
configure.ac (21 lines changed)
@@ -226,23 +226,14 @@ case ${ac_SFW_FP16} in
 esac
 
 ############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
-AC_ARG_ENABLE([accelerator-cshift],
-              [AS_HELP_STRING([--enable-accelerator-cshift=yes|no],[run cshift on the device])],
-              [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes])
+AC_ARG_ENABLE([accelerator-aware-mpi],
+              [AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
+              [ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
 
-AC_ARG_ENABLE([ucx-buggy],
-              [AS_HELP_STRING([--enable-ucx-buggy=yes|no],[enable workaround for UCX device buffer bugs])],
-              [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no])
-
-case ${ac_UCXBUGGY} in
+case ${ac_ACCELERATOR_AWARE_MPI} in
 yes)
-  ac_ACC_CSHIFT=no;;
-*);;
-esac
-
-case ${ac_ACC_CSHIFT} in
-yes)
-  AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);;
+  AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
+  AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
 *);;
 esac
 
@@ -25,12 +25,16 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
-export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
 export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
 export MPICH_OFI_NIC_POLICY=GPU
+export FI_CXI_CQ_FILL_PERCENT=10
+export FI_CXI_DEFAULT_CQ_SIZE=262144
+#export FI_CXI_DEFAULT_CQ_SIZE=131072
+#export FI_CXI_CQ_FILL_PERCENT=20
 
 # 12 ppn, 32 nodes, 384 ranks
 #
@@ -45,12 +49,12 @@ CMD="mpiexec -np 12288 -ppn 12 -envall \
              ./gpu_tile_compact.sh \
              ./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
              --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf.small
+$CMD | tee 1024node.dwf.small.cq
 
 CMD="mpiexec -np 12288 -ppn 12 -envall \
              ./gpu_tile_compact.sh \
              ./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
              --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
-$CMD | tee 1024node.dwf
+$CMD | tee 1024node.dwf.cq
 
 
@@ -17,6 +17,7 @@ source ../sourceme.sh
 export OMP_NUM_THREADS=3
 export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
 
+
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
 #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
 #unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@@ -35,11 +36,25 @@ CMD="mpiexec -np 24 -ppn 12 -envall \
              ./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
              --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 
-$CMD
+#$CMD
 
 CMD="mpiexec -np 24 -ppn 12 -envall \
              ./gpu_tile_compact.sh \
              ./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
              --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
 
+#$CMD
+
+CMD="mpiexec -np 1 -ppn 1 -envall \
+             ./gpu_tile_compact.sh \
+             ./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+             --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
+$CMD
+
+CMD="mpiexec -np 1 -ppn 1 -envall \
+             ./gpu_tile_compact.sh \
+             ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
+             --shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
+
 $CMD
@@ -1,16 +1,16 @@
-TOOLS=$HOME/tools
 ../../configure \
         --enable-simd=GPU \
         --enable-gen-simd-width=64 \
         --enable-comms=mpi-auto \
-        --enable-accelerator-cshift \
         --disable-gparity \
         --disable-fermion-reps \
         --enable-shm=nvlink \
         --enable-accelerator=sycl \
+        --enable-accelerator-aware-mpi=no\
         --enable-unified=no \
         MPICXX=mpicxx \
         CXX=icpx \
-        LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-        CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+        LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
+        CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
 

systems/Aurora/sourceme-sunspot-deterministic.sh (new file, 2 lines)
@@ -0,0 +1,2 @@
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
@@ -3,6 +3,19 @@
 module use /soft/modulefiles
 module load intel_compute_runtime/release/agama-devel-682.22
 
+export FI_CXI_DEFAULT_CQ_SIZE=131072
+export FI_CXI_CQ_FILL_PERCENT=20
+
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
+#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
+
+#
+# -ftarget-register-alloc-mode=pvc:default
+# -ftarget-register-alloc-mode=pvc:small
+# -ftarget-register-alloc-mode=pvc:large
+# -ftarget-register-alloc-mode=pvc:auto
+#
+
 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
 export http_proxy=http://proxy.alcf.anl.gov:3128
@@ -10,3 +23,4 @@ export https_proxy=http://proxy.alcf.anl.gov:3128
 #export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
 git config --global http.proxy http://proxy.alcf.anl.gov:3128
 
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
systems/Aurora/tests/repro128.pbs (new file, 41 lines)
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -q EarlyAppAccess
+#PBS -l select=128
+#PBS -l walltime=02:00:00
+#PBS -A LatticeQCD_aesp_CNDA
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+# 12 ppn, 16 nodes, 192 ranks
+# 12 ppn, 128 nodes, 1536 ranks
+CMD="mpiexec -np 1536 -ppn 12 -envall \
+             ./gpu_tile_compact.sh \
+             ./Test_dwf_mixedcg_prec --mpi 4.4.4.24 --grid 128.128.128.384 \
+             --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 7000 --comms-overlap "
+$CMD
systems/Aurora/tests/repro16.pbs (new file, 61 lines)
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
+
+#PBS -l select=16:system=sunspot,place=scatter
+#PBS -A LatticeQCD_aesp_CNDA
+#PBS -l walltime=01:00:00
+#PBS -N dwf
+#PBS -k doe
+
+#export OMP_PROC_BIND=spread
+#unset OMP_PLACES
+
+cd $PBS_O_WORKDIR
+
+#source ../sourceme.sh
+
+cat $PBS_NODEFILE
+
+#export MPICH_COLL_SYNC=1
+#export MPICH_ENV_DISPLAY=1
+export MPICH_
+export OMP_NUM_THREADS=3
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+module load oneapi/eng-compiler/2023.05.15.003
+module load mpich/51.2/icc-all-deterministic-pmix-gpu
+#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH
+
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
+#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
+#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
+export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
+export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
+unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
+unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
+
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
+export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
+export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
+export MPICH_OFI_NIC_POLICY=GPU
+
+DIR=repro.$PBS_JOBID
+mkdir $DIR
+cd $DIR
+
+CMD="mpiexec -np 192 -ppn 12 -envall \
+             ../gpu_tile_compact.sh \
+             ../Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
+             --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --debug-stdout --log Message,Iterative"
+#--comms-overlap
+$CMD
+
+grep Oops Grid.stderr.* > failures.$PBS_JOBID
+rm core.*
+
systems/Aurora/tests/repro1gpu.pbs (new file, 82 lines)
@@ -0,0 +1,82 @@
#!/bin/bash

#PBS -l select=16:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N repro1gpu
#PBS -k doe

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

cd $PBS_O_WORKDIR

NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE

echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE

for g in {0..11}
do
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )

export numa=${NUMA_MAP[$g]}
export gpu_id=${GPU_MAP[$g]}
export tile_id=${TILE_MAP[$g]}
export gpu=$gpu_id.$tile_id

cd $PBS_O_WORKDIR

DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
mkdir -p $DIR
cd $DIR

echo $THIS_NODE > nodefile
echo $gpu > gpu

export ZE_AFFINITY_MASK=$gpu
export ONEAPI_DEVICE_FILTER=gpu,level_zero

CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \
    numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
    --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
echo $CMD
$CMD &

done
done

wait
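The three map arrays above encode the per-node binding on the 12-way PVC nodes: loop index g = 0..5 pins one rank to tile 0 of GPUs 0-5, and g = 6..11 to tile 1 of the same GPUs, with each rank placed on NUMA domain 0 or 1, presumably to match the socket that owns the GPU. A standalone bash sketch that just prints the resulting binding table (same arrays as the script):

  #!/bin/bash
  # print the slot -> (numa node, gpu.tile) binding implied by the maps above
  NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1)
  TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1)
  GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5)
  for g in {0..11} ; do
    echo "slot $g -> numactl -N ${NUMA_MAP[$g]}, ZE_AFFINITY_MASK=${GPU_MAP[$g]}.${TILE_MAP[$g]}"
  done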
systems/Aurora/tests/reproN.pbs (new file, 98 lines)
@@ -0,0 +1,98 @@
#!/bin/bash

#PBS -l select=32:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N reproN
#PBS -k doe

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1

export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=0
export GRID_CHECKSUM_SEND_BUF=0

export MPICH_OFI_NIC_POLICY=GPU

export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE

cd $PBS_O_WORKDIR

NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE

echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do

cd $PBS_O_WORKDIR

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE

DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

mkdir -p $DIR
cd $DIR

echo $THIS_NODE > nodefile

#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
#    ../../gpu_tile_compact.sh \
#    ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
#    --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
    ../../gpu_tile_compact.sh \
    ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
    --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

echo $CMD > command-line
env > environment
$CMD &

done

# Suspicious: wait allowed jobs to collide and knock each other out
#wait

sleep 6500

for n in ` eval echo {1..$NN} `
do

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

cd $PBS_O_WORKDIR   # return to the submission directory first, so the relative $DIR resolves on every pass
cd $DIR

grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*

done
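The GRID_PRINT_ENTIRE_LOG / GRID_CHECKSUM_RECV_BUF / GRID_CHECKSUM_SEND_BUF exports above are read by Test_dwf_mixedcg_prec via getenv (see the FlightRecorder hook-up in the test diff further down) and map onto FlightRecorder::PrintEntireLog, FlightRecorder::ChecksumComms and FlightRecorder::ChecksumCommsSend respectively. A minimal sketch of a more aggressive debugging configuration for a single run:

  # enable both send- and receive-buffer checksumming plus the full log
  export GRID_CHECKSUM_RECV_BUF=1    # FlightRecorder::ChecksumComms
  export GRID_CHECKSUM_SEND_BUF=1    # FlightRecorder::ChecksumCommsSend
  export GRID_PRINT_ENTIRE_LOG=1     # FlightRecorder::PrintEntireLog
  $CMD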
systems/Aurora/tests/solver/stag16.pbs (new file, 40 lines)
@@ -0,0 +1,40 @@
#!/bin/bash

## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00

#PBS -q EarlyAppAccess
#PBS -l select=16
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

cd $PBS_O_WORKDIR

source ../../sourceme.sh

cat $PBS_NODEFILE

export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1

#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST

export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

# 12 ppn, 16 nodes, 192 ranks
CMD="mpiexec -np 192 -ppn 12 -envall \
    ./gpu_tile_compact.sh \
    ./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
    --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000 --comms-overlap"
$CMD
systems/Booster/benchmarks/Benchmark_usqcd.csv (new file, 70 lines)
@@ -0,0 +1,70 @@
Memory Bandwidth

Bytes, GB/s per node
3145728, 225.900365
50331648, 2858.859504
254803968, 4145.556367
805306368, 4905.772480
1966080000, 4978.312557


GEMM

M, N, K, BATCH, GF/s per rank
16, 8, 16, 256, 1.713639
16, 16, 16, 256, 288.268316
16, 32, 16, 256, 597.053950
32, 8, 32, 256, 557.382591
32, 16, 32, 256, 1100.145311
32, 32, 32, 256, 1885.080449
64, 8, 64, 256, 1725.163599
64, 16, 64, 256, 3389.336566
64, 32, 64, 256, 4168.252422
16, 8, 256, 256, 1326.262134
16, 16, 256, 256, 2318.095475
16, 32, 256, 256, 3555.436503
32, 8, 256, 256, 1920.139170
32, 16, 256, 256, 3486.174753
32, 32, 256, 256, 5320.821724
64, 8, 256, 256, 2539.597502
64, 16, 256, 256, 5003.456775
64, 32, 256, 256, 7837.531562
8, 256, 16, 256, 1427.848170
16, 256, 16, 256, 2222.147815
32, 256, 16, 256, 2877.121715
8, 256, 32, 256, 1922.890086
16, 256, 32, 256, 3199.469082
32, 256, 32, 256, 4845.405343
8, 256, 64, 256, 2639.483343
16, 256, 64, 256, 5012.800299
32, 256, 64, 256, 7216.006882


Communications

Packet bytes, direction, GB/s per node
4718592, 2, 206.570734
4718592, 3, 207.501847
4718592, 6, 189.730277
4718592, 7, 204.301218
15925248, 2, 307.882997
15925248, 3, 287.901076
15925248, 6, 295.603109
15925248, 7, 300.682033
37748736, 2, 331.740364
37748736, 3, 338.610627
37748736, 6, 332.580657
37748736, 7, 336.336579


Per node summary table

L , Wilson, DWF4, Staggered, GF/s per node

8 , 16, 1165, 10
12 , 473, 4901, 163
16 , 1436, 8464, 442
24 , 4133, 10139, 1530
32 , 5726, 11487, 2518
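For context, this CSV has the shape of the report emitted by Grid's Benchmark_usqcd binary (memory bandwidth, batched GEMM, communications, and a per-node summary). The launch line is not recorded in the file; on a single Booster node it would plausibly look like the following sketch, where the geometry and thread count are assumptions, not values taken from this commit:

  mpiexec -np 4 -ppn 4 ./Benchmark_usqcd --mpi 1.1.2.2 --grid 32.32.64.64 \
    --accelerator-threads 8 > Benchmark_usqcd.csv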
@@ -5,10 +5,12 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
 --enable-gen-simd-width=64 \
 --enable-shm=nvlink \
 --enable-accelerator=cuda \
+--disable-gparity \
+--disable-fermion-reps \
 --with-lime=$LIME \
---disable-accelerator-cshift \
+--enable-accelerator-cshift \
 --disable-unified \
 CXX=nvcc \
 LDFLAGS="-cudart shared " \
-CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"
+CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"
@@ -1,5 +1,5 @@
-module load GCC/9.3.0
-module load GMP/6.2.0
-module load MPFR/4.1.0
-module load OpenMPI/4.1.0rc1
-module load CUDA/11.3
+module load GCC
+module load GMP
+module load MPFR
+module load OpenMPI
+module load CUDA
@@ -16,7 +16,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
 --disable-fermion-reps \
 CXX=hipcc MPICXX=mpicxx \
 CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \
-LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "
+LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
 
 
 
@@ -1,3 +1,5 @@
 export https_proxy=http://proxy-chain.intel.com:911
 module load intel-release
 module load intel/mpich
+export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
+export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
@@ -1,4 +1,4 @@
-TOOLS=$HOME/tools
+
 ../../configure \
 --enable-simd=GPU \
 --enable-gen-simd-width=64 \
@@ -11,6 +11,6 @@ TOOLS=$HOME/tools
 --enable-unified=no \
 MPICXX=mpicxx \
 CXX=icpx \
-LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
-CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"
+LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl" \
+CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel"
 
systems/Sunspot/sourceme.sh (new file, 2 lines)
@@ -0,0 +1,2 @@
module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu
systems/Sunspot/tests/repro1gpu.pbs (new file, 81 lines)
@@ -0,0 +1,81 @@
#!/bin/bash

#PBS -l select=16:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N repro1gpu
#PBS -k doe

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU

export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE

cd $PBS_O_WORKDIR

NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE

echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE

for g in {0..11}
do
export NUMA_MAP=(0 0 0 1 1 1 0 0 0 1 1 1 )
export TILE_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 )
export GPU_MAP=(0 1 2 3 4 5 0 1 2 3 4 5 )

export numa=${NUMA_MAP[$g]}
export gpu_id=${GPU_MAP[$g]}
export tile_id=${TILE_MAP[$g]}
export gpu=$gpu_id.$tile_id

cd $PBS_O_WORKDIR

DIR=repro.1gpu.$PBS_JOBID/node-$n-$THIS_NODE-GPU-$gpu
mkdir -p $DIR
cd $DIR

echo $THIS_NODE > nodefile
echo $gpu > gpu

export ZE_AFFINITY_MASK=$gpu
export ONEAPI_DEVICE_FILTER=gpu,level_zero

CMD="mpiexec -np 1 -ppn 1 -envall --hostfile nodefile \
    numactl -N $numa -m $numa ../../Test_dwf_mixedcg_prec --mpi 1.1.1.1 --grid 16.16.32.32 \
    --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message"
echo $CMD
$CMD &

done
done

wait
systems/Sunspot/tests/reproN.pbs (new file, 97 lines)
@@ -0,0 +1,97 @@
#!/bin/bash

#PBS -l select=32:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=02:00:00
#PBS -N reproN
#PBS -k doe

#export OMP_PROC_BIND=spread
#unset OMP_PLACES

module load oneapi/eng-compiler/2023.05.15.003
module load mpich/51.2/icc-all-deterministic-pmix-gpu

# 56 cores / 6 threads ~9
export OMP_NUM_THREADS=6
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1

export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1

export GRID_PRINT_ENTIRE_LOG=0
export GRID_CHECKSUM_RECV_BUF=1
export GRID_CHECKSUM_SEND_BUF=0

export MPICH_OFI_NIC_POLICY=GPU

export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE

cd $PBS_O_WORKDIR

NN=`cat $PBS_NODEFILE | wc -l`
echo $PBS_NODEFILE
cat $PBS_NODEFILE

echo $NN nodes in node file
for n in `eval echo {1..$NN}`
do

cd $PBS_O_WORKDIR

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
echo Node $n is $THIS_NODE

DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

mkdir -p $DIR
cd $DIR

echo $THIS_NODE > nodefile

#CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
#    ../../gpu_tile_compact.sh \
#    ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
#    --shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

CMD="mpiexec -np 12 -ppn 12 -envall --hostfile nodefile \
    ../../gpu_tile_compact.sh \
    ../../Test_dwf_mixedcg_prec --mpi 1.2.2.3 --grid 32.64.64.96 \
    --shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 6000 --debug-stdout --log Message --comms-overlap"

echo $CMD > command-line
env > environment
$CMD &

done

# Suspicious: wait allowed jobs to collide and knock each other out
#wait

sleep 6500

for n in ` eval echo {1..$NN} `
do

THIS_NODE=`head -n$n $PBS_NODEFILE | tail -n1 `
DIR=reproN.$PBS_JOBID/node-$n-$THIS_NODE

cd $PBS_O_WORKDIR   # return to the submission directory first, so the relative $DIR resolves on every pass
cd $DIR

grep Oops Grid.stderr.* > failures.$PBS_JOBID
rm core.*

done
@@ -30,27 +30,60 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 using namespace std;
 using namespace Grid;
 
-template<class d>
-struct scal {
-  d internal;
+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+#endif
+
+
+NAMESPACE_BEGIN(Grid);
+template<class Matrix,class Field>
+class SchurDiagMooeeOperatorParanoid : public SchurOperatorBase<Field> {
+ public:
+  Matrix &_Mat;
+  SchurDiagMooeeOperatorParanoid (Matrix &Mat): _Mat(Mat){};
+  virtual void Mpc (const Field &in, Field &out) {
+    Field tmp(in.Grid());
+    tmp.Checkerboard() = !in.Checkerboard();
+    //  std::cout <<" Mpc starting"<<std::endl;
+
+    RealD nn = norm2(in); // std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl;
+    _Mat.Meooe(in,tmp);
+    nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooeinv "<<nn<<std::endl;
+    _Mat.MooeeInv(tmp,out);
+    nn = norm2(out); //std::cout <<" Mpc Prior to dslash norm is "<<nn<<std::endl;
+    _Mat.Meooe(out,tmp);
+    nn = norm2(tmp); //std::cout <<" Mpc Prior to Mooee "<<nn<<std::endl;
+    _Mat.Mooee(in,out);
+    nn = norm2(out); //std::cout <<" Mpc Prior to axpy "<<nn<<std::endl;
+    axpy(out,-1.0,tmp,out);
+  }
+  virtual void MpcDag (const Field &in, Field &out){
+    Field tmp(in.Grid());
+    //  std::cout <<" MpcDag starting"<<std::endl;
+    RealD nn = norm2(in);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl;
+    _Mat.MeooeDag(in,tmp);
+    _Mat.MooeeInvDag(tmp,out);
+    nn = norm2(out);// std::cout <<" MpcDag Prior to dslash norm is "<<nn<<std::endl;
+    _Mat.MeooeDag(out,tmp);
+    nn = norm2(tmp);// std::cout <<" MpcDag Prior to Mooee "<<nn<<std::endl;
+    _Mat.MooeeDag(in,out);
+    nn = norm2(out);// std::cout <<" MpcDag Prior to axpy "<<nn<<std::endl;
+    axpy(out,-1.0,tmp,out);
+  }
 };
 
-Gamma::Algebra Gmu [] = {
-  Gamma::Algebra::GammaX,
-  Gamma::Algebra::GammaY,
-  Gamma::Algebra::GammaZ,
-  Gamma::Algebra::GammaT
-};
+NAMESPACE_END(Grid);
 
 int main (int argc, char ** argv)
 {
+  char hostname[HOST_NAME_MAX+1];
+  gethostname(hostname, HOST_NAME_MAX+1);
+  std::string host(hostname);
+
   Grid_init(&argc,&argv);
 
   const int Ls=12;
 
-  std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
-
-  {
   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
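The SchurDiagMooeeOperatorParanoid class added above behaves like the stock SchurDiagMooee operator but inserts a norm2() (a global reduction) after every stage, which presumably gives the FlightRecorder record/verify machinery extra checkpoints with which to localise a non-reproducible step. Written out, the Mpc() call sequence applies the usual even-odd preconditioned operator on the odd checkerboard,

  Mpc = Mooee - Meooe * MooeeInv * Meooe ,

i.e. out = Mooee(in) - Meooe(MooeeInv(Meooe(in))) via the Meooe / MooeeInv / Meooe / Mooee / axpy calls, and MpcDag() applies the daggered analogue.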
@@ -89,10 +122,17 @@ int main (int argc, char ** argv)
   result_o_2.Checkerboard() = Odd;
   result_o_2 = Zero();
 
-  SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
-  SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
+  SchurDiagMooeeOperatorParanoid<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
 
+  int nsecs=600;
+  if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
+    std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
+    GridCmdOptionInt(arg,nsecs);
+  }
+
-  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
+  std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
+
   MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
   double t1,t2,flops;
   double MdagMsiteflops = 1452; // Mobius (real coeffs)
@@ -101,7 +141,26 @@ int main (int argc, char ** argv)
   std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
   std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
   int iters;
-  for(int i=0;i<10;i++){
+
+  time_t start = time(NULL);
+
+  FlightRecorder::ContinueOnFail = 0;
+  FlightRecorder::PrintEntireLog = 0;
+  FlightRecorder::ChecksumComms = 1;
+  FlightRecorder::ChecksumCommsSend=0;
+
+  if(char *s=getenv("GRID_PRINT_ENTIRE_LOG")) FlightRecorder::PrintEntireLog = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_RECV_BUF")) FlightRecorder::ChecksumComms = atoi(s);
+  if(char *s=getenv("GRID_CHECKSUM_SEND_BUF")) FlightRecorder::ChecksumCommsSend = atoi(s);
+
+  int iter=0;
+  do {
+    if ( iter == 0 ) {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    } else {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
+    }
+    std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
   result_o = Zero();
   t1=usecond();
   mCG(src_o,result_o);
@@ -111,10 +170,24 @@ int main (int argc, char ** argv)
   flops+= CGsiteflops*FrbGrid->gSites()*iters;
   std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
   std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
-  std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
+  std::cout << " SinglePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
+  assert(FlightRecorder::ErrorCount()==0);
+
+  std::cout << " FlightRecorder is OK! "<<std::endl;
+  iter ++;
+  } while (time(NULL) < (start + nsecs/10) );
+
+  std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
   ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
-  for(int i=0;i<1;i++){
+  int i=0;
+  do {
+    if ( i == 0 ) {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
+    } else {
+      FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeVerify);
+    }
+    std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
   result_o_2 = Zero();
   t1=usecond();
   CG(HermOpEO,src_o,result_o_2);
@@ -125,43 +198,17 @@ int main (int argc, char ** argv)
 
   std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
   std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
-  }
+  std::cout << " DoublePrecision error count "<< FlightRecorder::ErrorCount()<<std::endl;
+  assert(FlightRecorder::ErrorCount()==0);
-  // MemoryManager::Print();
+  std::cout << " FlightRecorder is OK! "<<std::endl;
+  i++;
+  } while (time(NULL) < (start + nsecs) );
 
   LatticeFermionD diff_o(FrbGrid);
   RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
+
   std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
+  assert(diff < 1e-4);
-#ifdef HAVE_LIME
-  if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
-
-    std::string file1("./Propagator1");
-    emptyUserRecord record;
-    uint32_t nersc_csum;
-    uint32_t scidac_csuma;
-    uint32_t scidac_csumb;
-    typedef SpinColourVectorD FermionD;
-    typedef vSpinColourVectorD vFermionD;
-
-    BinarySimpleMunger<FermionD,FermionD> munge;
-    std::string format = getFormatString<vFermionD>();
-
-    BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
-                                                     nersc_csum,scidac_csuma,scidac_csumb);
-
-    std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-
-    BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
-                                                     nersc_csum,scidac_csuma,scidac_csumb);
-
-    std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
-  }
-#endif
-  }
-
-  MemoryManager::Print();
 
   Grid_finalize();
 }