1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-20 16:56:55 +01:00

Compare commits

..

1 Commits

Author SHA1 Message Date
f5bb378575 Merge 6cd2d8fcd5 into 73c0b29535 2024-02-26 09:55:26 -05:00
47 changed files with 253 additions and 3106 deletions

4
.gitignore vendored
View File

@ -1,7 +1,3 @@
# Doxygen stuff
html/*
latex/*
# Compiled Object files #
#########################
*.slo

View File

@ -34,7 +34,7 @@
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
#undef EIGEN_USE_SYCL
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif

View File

@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
* Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
* type = 1 for the approximation which is infinite at x = 0. */
zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
l, invlambda, xi, xisq, *tv, s, opl;
int m, czero, ts;
@ -375,12 +375,12 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to ZOLO_PRECISION for external use only */
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -390,24 +390,24 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
}
zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
zolotarev_data* higham(PRECISION epsilon, int n) {
INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
int m, czero;
zolotarev_data *zd;
@ -481,9 +481,9 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -493,24 +493,24 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
#ifdef TEST
#undef ZERO
#define ZERO ((ZOLO_PRECISION) 0)
#define ZERO ((PRECISION) 0)
#undef ONE
#define ONE ((ZOLO_PRECISION) 1)
#define ONE ((PRECISION) 1)
#undef TWO
#define TWO ((ZOLO_PRECISION) 2)
#define TWO ((PRECISION) 2)
/* Evaluate the rational approximation R(x) using the factored form */
static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R;
PRECISION R;
if (rdata -> type == 0) {
R = rdata -> A * x;
@ -551,9 +551,9 @@ static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
/* Evaluate the rational approximation R(x) using the partial fraction form */
static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
PRECISION R = rdata -> alpha[rdata -> da - 1];
for (m = 0; m < rdata -> dd; m++)
R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@ -568,18 +568,18 @@ static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data*
* non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
* but with signalling overflow you will get an error message. */
static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R = rdata -> beta[0] * x;
PRECISION R = rdata -> beta[0] * x;
for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
return R;
}
/* Evaluate the rational approximation R(x) using Cayley form */
static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION T;
PRECISION T;
T = rdata -> type == 0 ? ONE : -ONE;
for (m = 0; m < rdata -> n; m++)
@ -607,7 +607,7 @@ int main(int argc, char** argv) {
int m, n, plotpts = 5000, type = 0;
float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
zolotarev_data *rdata;
ZOLO_PRECISION y;
PRECISION y;
FILE *plot_function, *plot_error,
*plot_partfrac, *plot_contfrac, *plot_cayley;
@ -626,13 +626,13 @@ int main(int argc, char** argv) {
}
rdata = type == 2
? higham((ZOLO_PRECISION) eps, n)
: zolotarev((ZOLO_PRECISION) eps, n, type);
? higham((PRECISION) eps, n)
: zolotarev((PRECISION) eps, n, type);
printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
"\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
"\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
"\tPRECISION = " STRINGIFY(PRECISION)
"\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
"\tDelta = %g (maximum error)\n\n"
"\tA = %g (overall factor)\n",
@ -681,15 +681,15 @@ int main(int argc, char** argv) {
x = 2.4 * (float) m / plotpts - 1.2;
if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
/* skip x = 0 for type 1, as R(0) is singular */
y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
y = zolotarev_eval((PRECISION) x, rdata);
fprintf(plot_function, "%g %g\n", x, (float) y);
fprintf(plot_error, "%g %g\n",
x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
maxypferr = MAX(maxypferr, fabs(ypferr));

View File

@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef ZOLO_PRECISION
#define ZOLO_PRECISION double
#ifndef PRECISION
#define PRECISION double
#endif
#define ZPRECISION ZOLO_PRECISION
#define ZPRECISION PRECISION
#define ZOLOTAREV_DATA zolotarev_data
#endif
@ -77,8 +77,8 @@ typedef struct {
* zolotarev_data structure. The arguments must satisfy the constraints that
* epsilon > 0, n > 0, and type = 0 or 1. */
ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
void zolotarev_free(zolotarev_data *zdata);
#endif
@ -86,4 +86,3 @@ void zolotarev_free(zolotarev_data *zdata);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@ -1,34 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_BEGIN(Grid);
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
NAMESPACE_END(Grid);

View File

@ -1,727 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#ifdef GRID_HIP
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
#include <cublas_v2.h>
#endif
#ifdef GRID_SYCL
#include <oneapi/mkl.hpp>
#endif
#if 0
#define GRID_ONE_MKL
#endif
#ifdef GRID_ONE_MKL
#include <oneapi/mkl.hpp>
#endif
///////////////////////////////////////////////////////////////////////
// Need to rearrange lattice data to be in the right format for a
// batched multiply. Might as well make these static, dense packed
///////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP
typedef hipblasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_CUDA
typedef cublasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_SYCL
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#ifdef GRID_ONE_MKL
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t;
#endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
class GridBLAS {
public:
static gridblasHandle_t gridblasHandle;
static int gridblasInit;
static void Init(void)
{
if ( ! gridblasInit ) {
#ifdef GRID_CUDA
std::cout << "cublasCreate"<<std::endl;
cublasCreate(&gridblasHandle);
cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
#endif
#ifdef GRID_HIP
std::cout << "hipblasCreate"<<std::endl;
hipblasCreate(&gridblasHandle);
#endif
#ifdef GRID_SYCL
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
cl::sycl::cpu_selector selector;
cl::sycl::device selectedDevice { selector };
gridblasHandle =new sycl::queue (selectedDevice);
#endif
gridblasInit=1;
}
}
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
/////////////////////////////////////////////////////////////////////////////////////
// BLAS GEMM conventions:
/////////////////////////////////////////////////////////////////////////////////////
// - C = alpha A * B + beta C
// Dimensions:
// - C_m.n
// - A_m.k
// - B_k.n
// - Flops = 8 M N K
// - Bytes = 2*sizeof(word) * (MN+MK+KN)
// M=60, N=12
// Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD
/////////////////////////////////////////////////////////////////////////////////////
void synchronise(void)
{
#ifdef GRID_HIP
auto err = hipDeviceSynchronize();
assert(err==hipSuccess);
#endif
#ifdef GRID_CUDA
auto err = cudaDeviceSynchronize();
assert(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
accelerator_barrier();
#endif
#ifdef GRID_ONE_MKL
gridblasHandle->wait();
#endif
}
void gemmBatched(int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
ComplexF alpha,
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexD> alpha_p(1);
static deviceVector<ComplexD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
RealD t0=usecond();
// std::cout << "ZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex **)&Amk[0], lda,
(hipblasDoubleComplex **)&Bkn[0], ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl;
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex **)&Amk[0], lda,
(cuDoubleComplex **)&Bkn[0], ldb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
// synchronise();
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
// std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
// std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
// std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexF alpha,
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexF> alpha_p(1);
static deviceVector<ComplexF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex **)&Amk[0], lda,
(hipblasComplex **)&Bkn[0], ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
ComplexF alphaf(real(alpha),imag(alpha));
ComplexF betaf(real(beta),imag(beta));
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexF c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Single precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealF> alpha_p(1);
static deviceVector<RealF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealD> alpha_p(1);
static deviceVector<RealD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasDgemmBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasDgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
/*
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t batchCount64=batchCount;
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
onemkl::transpose::N,
onemkl::transpose::N,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
1,&batchCount64);
*/
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Strided case used by benchmark, but generally unused in Grid
// Keep a code example in double complex, but don't generate the single and real variants for now
////////////////////////////////////////////////////////////////////////////////////////////////
void gemmStridedBatched(int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // pointer list to matrices
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn,
int batchCount)
{
// Use C-row major storage, so transpose calls
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
int sda = m*k;
int sdb = k*n;
int sdc = m*n;
deviceVector<ComplexD> alpha_p(1);
deviceVector<ComplexD> beta_p(1);
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
// std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
// std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
// std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
#ifdef GRID_HIP
auto err = hipblasZgemmStridedBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda, sda,
(hipblasDoubleComplex *) Bkn, ldb, sdb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc, sdc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasZgemmStridedBatched(gridblasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda, sda,
(cuDoubleComplex *) Bkn, ldb, sdb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc, sdc,
batchCount);
#endif
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
oneapi::mkl::transpose::N,
oneapi::mkl::transpose::N,
m,n,k,
alpha,
(const ComplexD *)Amk,lda,sda,
(const ComplexD *)Bkn,ldb,sdb,
beta,
(ComplexD *)Cmn,ldc,sdc,
batchCount);
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
}
}
}
#endif
}
double benchmark(int M, int N, int K, int BATCH)
{
int32_t N_A = M*K*BATCH;
int32_t N_B = K*N*BATCH;
int32_t N_C = M*N*BATCH;
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
ComplexD alpha(1.0);
ComplexD beta (1.0);
RealD flops = 8.0*M*N*K*BATCH;
int ncall=10;
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemmStridedBatched(M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0], // m x n
BATCH);
}
synchronise();
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
flops = 8.0*M*N*K*BATCH*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
};
NAMESPACE_END(Grid);

View File

@ -176,7 +176,6 @@ template<class T> using cshiftAllocator = std::allocator<T>;
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid);

View File

@ -35,7 +35,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
@ -47,4 +46,5 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/PaddedCell.h>

View File

@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
#if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region

View File

@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
}
}
template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);

View File

@ -285,7 +285,6 @@ template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
ComplexD nrm = rankInnerProduct(left,right);
// std::cerr<<"flight log " << std::hexfloat << nrm <<" "<<crc(left)<<std::endl;
grid->GlobalSum(nrm);
return nrm;
}

View File

@ -119,13 +119,18 @@ template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, V
template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
{
autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) {
autoView(Data_v, Data, AcceleratorRead);
#if defined(GRID_CUDA)
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
#elif defined (GRID_HIP) //hipcub cannot deal with large vobjs that don't fit in shared memory, therefore separate into _small/_large.
if constexpr (sizeof(vobj) <= 256) {
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
else {
sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
#endif
}
#endif
@ -210,4 +215,4 @@ template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Ve
}
NAMESPACE_END(Grid);
NAMESPACE_END(Grid);

View File

@ -469,13 +469,15 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
vobj cd = Zero();
vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){

View File

@ -45,7 +45,6 @@ public:
};
// Host only
GridBase * getGrid(void) const { return _grid; };
vobj* getHostPointer(void) const { return _odata; };
};
/////////////////////////////////////////////////////////////////////////////////////////

View File

@ -179,11 +179,11 @@ extern GridLogger GridLogSolver;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;
extern GridLogger GridLogIntegrator;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern GridLogger GridLogTracing;
@ -191,41 +191,6 @@ extern Colours GridLogColours;
std::string demangle(const char* name) ;
template<typename... Args>
inline std::string sjoin(Args&&... args) noexcept {
std::ostringstream msg;
(msg << ... << args);
return msg.str();
}
/*! @brief make log messages work like python print */
template <typename... Args>
inline void Grid_log(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << GridLogMessage << msg << std::endl;
}
/*! @brief make warning messages work like python print */
template <typename... Args>
inline void Grid_warn(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl;
}
/*! @brief make error messages work like python print */
template <typename... Args>
inline void Grid_error(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl;
}
/*! @brief make pass messages work like python print */
template <typename... Args>
inline void Grid_pass(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
}
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];

View File

@ -63,9 +63,7 @@ public:
virtual void MooeeDag(const FermionField &in, FermionField &out) ;
virtual void MooeeInv(const FermionField &in, FermionField &out) ;
virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
virtual void M(const FermionField &in, FermionField &out) ;
virtual void Mdag(const FermionField &in, FermionField &out) ;
private:
RealD mu; // TwistedMass parameter

View File

@ -280,16 +280,20 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");
}
@ -318,13 +322,19 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
}
}

View File

@ -93,25 +93,5 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
RealD b = tm /sq;
axpibg5x(out,in,a,b);
}
template<class Impl>
void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
this->Dhop(in, out, DaggerNo);
FermionField tmp(out.Grid());
RealD a = 4.0+this->mass;
RealD b = this->mu;
axpibg5x(tmp,in,a,b);
axpy(out, 1.0, tmp, out);
}
template<class Impl>
void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
this->Dhop(in, out, DaggerYes);
FermionField tmp(out.Grid());
RealD a = 4.0+this->mass;
RealD b = -this->mu;
axpibg5x(tmp,in,a,b);
axpy(out, 1.0, tmp, out);
}
NAMESPACE_END(Grid);

View File

@ -237,7 +237,7 @@ public:
for (int level = 0; level < as.size(); ++level) {
int multiplier = as.at(level).multiplier;
ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
ActionLevel<Field> * Level = new ActionLevel<Field>(multiplier);
Level->push_back(new EmptyAction<Field>);
LevelForces.push_back(*Level);
// does it copy by value or reference??

View File

@ -1,389 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/smearing/HISQSmearing.h
Copyright (C) 2023
Author: D. A. Clarke <clarke.davida@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/*
@file HISQSmearing.h
@brief Declares classes related to HISQ smearing
*/
#pragma once
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
NAMESPACE_BEGIN(Grid);
// TODO: find a way to fold this into the stencil header. need to access grid to get
// Nd, since you don't want to inherit from QCD.h
/*! @brief append arbitrary shift path to shifts */
template<typename... Args>
void appendShift(std::vector<Coordinate>& shifts, int dir, Args... args) {
Coordinate shift(Nd,0);
generalShift(shift, dir, args...);
// push_back creates an element at the end of shifts and
// assigns the data in the argument to it.
shifts.push_back(shift);
}
/*! @brief figure out the stencil index from mu and nu */
accelerator_inline int stencilIndex(int mu, int nu) {
// Nshifts depends on how you built the stencil
int Nshifts = 6;
return Nshifts*nu + Nd*Nshifts*mu;
}
/*! @brief structure holding the link treatment */
struct SmearingParameters{
SmearingParameters(){}
Real c_1; // 1 link
Real c_naik; // Naik term
Real c_3; // 3 link
Real c_5; // 5 link
Real c_7; // 7 link
Real c_lp; // 5 link Lepage
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: c_1(c1),
c_naik(cnaik),
c_3(c3),
c_5(c5),
c_7(c7),
c_lp(clp){}
};
/*! @brief create fat links from link variables */
template<class Gimpl>
class Smear_HISQ : public Gimpl {
private:
GridCartesian* const _grid;
SmearingParameters _linkTreatment;
public:
INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Gimpl::GaugeField GF;
typedef typename Gimpl::GaugeLinkField LF;
typedef typename Gimpl::ComplexField CF;
// Don't allow default values here.
Smear_HISQ(GridCartesian* grid, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: _grid(grid),
_linkTreatment(c1,cnaik,c3,c5,c7,clp) {
assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
}
// Allow to pass a pointer to a C-style, double array for MILC convenience
Smear_HISQ(GridCartesian* grid, double* coeff)
: _grid(grid),
_linkTreatment(coeff[0],coeff[1],coeff[2],coeff[3],coeff[4],coeff[5]) {
assert(Nc == 3 && "HISQ smearing currently implemented only for Nc==3");
assert(Nd == 4 && "HISQ smearing only defined for Nd==4");
}
~Smear_HISQ() {}
// Intent: OUT--u_smr, u_naik
// IN--u_thin
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
SmearingParameters lt = this->_linkTreatment;
auto grid = this->_grid;
// Create a padded cell of extra padding depth=1 and fill the padding.
int depth = 1;
PaddedCell Ghost(depth,grid);
GF Ughost = Ghost.Exchange(u_thin);
// This is where auxiliary N-link fields and the final smear will be stored.
GF Ughost_fat(Ughost.Grid());
GF Ughost_3link(Ughost.Grid());
GF Ughost_5linkA(Ughost.Grid());
GF Ughost_5linkB(Ughost.Grid());
// mu-nu plane stencil. We allow mu==nu to make indexing the stencil easier,
// but these entries will not be used.
std::vector<Coordinate> shifts;
for(int mu=0;mu<Nd;mu++)
for(int nu=0;nu<Nd;nu++) {
appendShift(shifts,mu);
appendShift(shifts,nu);
appendShift(shifts,shiftSignal::NO_SHIFT);
appendShift(shifts,mu,Back(nu));
appendShift(shifts,Back(nu));
appendShift(shifts,Back(mu));
}
// A GeneralLocalStencil has two indices: a site and stencil index
GeneralLocalStencil gStencil(Ughost.Grid(),shifts);
// This is where contributions from the smearing get added together
Ughost_fat=Zero();
// This loop handles 3-, 5-, and 7-link constructs, minus Lepage and Naik.
for(int mu=0;mu<Nd;mu++) {
// TODO: This approach is slightly memory inefficient. It uses 25% extra memory
Ughost_3link =Zero();
Ughost_5linkA=Zero();
Ughost_5linkB=Zero();
// Create the accessors
autoView(U_v , Ughost , AcceleratorRead);
autoView(U_fat_v , Ughost_fat , AcceleratorWrite);
autoView(U_3link_v , Ughost_3link , AcceleratorWrite);
autoView(U_5linkA_v, Ughost_5linkA, AcceleratorWrite);
autoView(U_5linkB_v, Ughost_5linkB, AcceleratorWrite);
// We infer some types that will be needed in the calculation.
typedef decltype(gStencil.GetEntry(0,0)) stencilElement;
typedef decltype(coalescedReadGeneralPermute(U_v[0](0),gStencil.GetEntry(0,0)->_permute,Nd)) U3matrix;
int Nsites = U_v.size();
auto gStencil_v = gStencil.View();
accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 3-link constructs
stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
U3matrix U0, U1, U2, U3, U4, U5, W;
for(int nu=0;nu<Nd;nu++) {
if(nu==mu) continue;
int s = stencilIndex(mu,nu);
// The stencil gives us support points in the mu-nu plane that we will use to
// grab the links we need.
SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset;
SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset;
SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset;
SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset;
SE5 = gStencil_v.GetEntry(s+5,site); int x_m_mu = SE5->_offset;
// When you're deciding whether to take an adjoint, the question is: how is the
// stored link oriented compared to the one you want? If I imagine myself travelling
// with the to-be-updated link, I have two possible, alternative 3-link paths I can
// take, one starting by going to the left, the other starting by going to the right.
U0 = coalescedReadGeneralPermute(U_v[x_p_mu ](nu),SE0->_permute,Nd);
U1 = coalescedReadGeneralPermute(U_v[x_p_nu ](mu),SE1->_permute,Nd);
U2 = coalescedReadGeneralPermute(U_v[x ](nu),SE2->_permute,Nd);
U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
U4 = coalescedReadGeneralPermute(U_v[x_m_nu ](mu),SE4->_permute,Nd);
U5 = coalescedReadGeneralPermute(U_v[x_m_nu ](nu),SE4->_permute,Nd);
// "left" "right"
W = U2*U1*adj(U0) + adj(U5)*U4*U3;
// Save 3-link construct for later and add to smeared field.
coalescedWrite(U_3link_v[x](nu), W);
// The index operator (x) returns the coalesced read on GPU. The view [] index returns
// a reference to the vector object. The [x](mu) returns a reference to the densely
// packed (contiguous in memory) mu-th element of the vector object. On CPU,
// coalescedRead/Write is the identity mapping assigning vector object to vector object.
// But on GPU it's non-trivial and maps scalar object to vector object and vice versa.
coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_3*W);
}
})
accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 5-link
stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
U3matrix U0, U1, U2, U3, U4, U5, W;
int sigmaIndex = 0;
for(int nu=0;nu<Nd;nu++) {
if(nu==mu) continue;
int s = stencilIndex(mu,nu);
for(int rho=0;rho<Nd;rho++) {
if (rho == mu || rho == nu) continue;
SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset;
SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset;
SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset;
SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset;
U0 = coalescedReadGeneralPermute( U_v[x_p_mu ](nu ),SE0->_permute,Nd);
U1 = coalescedReadGeneralPermute(U_3link_v[x_p_nu ](rho),SE1->_permute,Nd);
U2 = coalescedReadGeneralPermute( U_v[x ](nu ),SE2->_permute,Nd);
U3 = coalescedReadGeneralPermute( U_v[x_p_mu_m_nu](nu ),SE3->_permute,Nd);
U4 = coalescedReadGeneralPermute(U_3link_v[x_m_nu ](rho),SE4->_permute,Nd);
U5 = coalescedReadGeneralPermute( U_v[x_m_nu ](nu ),SE4->_permute,Nd);
W = U2*U1*adj(U0) + adj(U5)*U4*U3;
if(sigmaIndex<3) {
coalescedWrite(U_5linkA_v[x](rho), W);
} else {
coalescedWrite(U_5linkB_v[x](rho), W);
}
coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_5*W);
sigmaIndex++;
}
}
})
accelerator_for(site,Nsites,Simd::Nsimd(),{ // ----------- 7-link
stencilElement SE0, SE1, SE2, SE3, SE4, SE5;
U3matrix U0, U1, U2, U3, U4, U5, W;
int sigmaIndex = 0;
for(int nu=0;nu<Nd;nu++) {
if(nu==mu) continue;
int s = stencilIndex(mu,nu);
for(int rho=0;rho<Nd;rho++) {
if (rho == mu || rho == nu) continue;
SE0 = gStencil_v.GetEntry(s+0,site); int x_p_mu = SE0->_offset;
SE1 = gStencil_v.GetEntry(s+1,site); int x_p_nu = SE1->_offset;
SE2 = gStencil_v.GetEntry(s+2,site); int x = SE2->_offset;
SE3 = gStencil_v.GetEntry(s+3,site); int x_p_mu_m_nu = SE3->_offset;
SE4 = gStencil_v.GetEntry(s+4,site); int x_m_nu = SE4->_offset;
U0 = coalescedReadGeneralPermute(U_v[x_p_mu](nu),SE0->_permute,Nd);
if(sigmaIndex<3) {
U1 = coalescedReadGeneralPermute(U_5linkB_v[x_p_nu](rho),SE1->_permute,Nd);
} else {
U1 = coalescedReadGeneralPermute(U_5linkA_v[x_p_nu](rho),SE1->_permute,Nd);
}
U2 = coalescedReadGeneralPermute(U_v[x](nu),SE2->_permute,Nd);
U3 = coalescedReadGeneralPermute(U_v[x_p_mu_m_nu](nu),SE3->_permute,Nd);
if(sigmaIndex<3) {
U4 = coalescedReadGeneralPermute(U_5linkB_v[x_m_nu](rho),SE4->_permute,Nd);
} else {
U4 = coalescedReadGeneralPermute(U_5linkA_v[x_m_nu](rho),SE4->_permute,Nd);
}
U5 = coalescedReadGeneralPermute(U_v[x_m_nu](nu),SE4->_permute,Nd);
W = U2*U1*adj(U0) + adj(U5)*U4*U3;
coalescedWrite(U_fat_v[x](mu), U_fat_v(x)(mu) + lt.c_7*W);
sigmaIndex++;
}
}
})
} // end mu loop
// c1, c3, c5, c7 construct contributions
u_smr = Ghost.Extract(Ughost_fat) + lt.c_1*u_thin;
// Load up U and V std::vectors to access thin and smeared links.
std::vector<LF> U(Nd, grid);
std::vector<LF> V(Nd, grid);
std::vector<LF> Vnaik(Nd, grid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(u_thin, mu);
V[mu] = PeekIndex<LorentzIndex>(u_smr, mu);
}
for(int mu=0;mu<Nd;mu++) {
// Naik
Vnaik[mu] = lt.c_naik*Gimpl::CovShiftForward(U[mu],mu,
Gimpl::CovShiftForward(U[mu],mu,
Gimpl::CovShiftIdentityForward(U[mu],mu)));
// LePage
for (int nu_h=1;nu_h<Nd;nu_h++) {
int nu=(mu+nu_h)%Nd;
// nu, nu, mu, Back(nu), Back(nu)
V[mu] = V[mu] + lt.c_lp*Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftForward(U[mu],mu,
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftIdentityBackward(U[nu],nu)))))
// Back(nu), Back(nu), mu, nu, nu
+ lt.c_lp*Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftBackward(U[nu],nu,
Gimpl::CovShiftForward(U[mu],mu,
Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftIdentityForward(U[nu],nu)))));
}
}
// Put V back into u_smr.
for (int mu = 0; mu < Nd; mu++) {
PokeIndex<LorentzIndex>(u_smr , V[mu] , mu);
PokeIndex<LorentzIndex>(u_naik, Vnaik[mu], mu);
}
};
// Intent: OUT--u_proj
// IN--u_mu
void projectU3(GF& u_proj, GF& u_mu) const {
auto grid = this->_grid;
LF V(grid), Q(grid), sqrtQinv(grid), id_3(grid), diff(grid);
CF c0(grid), c1(grid), c2(grid), g0(grid), g1(grid), g2(grid), S(grid), R(grid), theta(grid),
u(grid), v(grid), w(grid), den(grid), f0(grid), f1(grid), f2(grid);
// Follow MILC 10.1103/PhysRevD.82.074501, eqs (B2-B3) and (C1-C8)
for (int mu = 0; mu < Nd; mu++) {
V = PeekIndex<LorentzIndex>(u_mu, mu);
Q = adj(V)*V;
c0 = real(trace(Q));
c1 = (1/2.)*real(trace(Q*Q));
c2 = (1/3.)*real(trace(Q*Q*Q));
S = (1/3.)*c1-(1/18.)*c0*c0;
if (norm2(S)<1e-28) {
g0 = (1/3.)*c0; g1 = g0; g2 = g1;
} else {
R = (1/2.)*c2-(1/3. )*c0*c1+(1/27.)*c0*c0*c0;
theta = acos(R*pow(S,-1.5));
g0 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta-2*M_PI/3.);
g1 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta );
g2 = (1/3.)*c0+2.*sqrt(S)*cos((1/3.)*theta+2*M_PI/3.);
}
// if (fabs(Q.determinant()/(g0*g1*g2)-1.0) > 1e-5) { SVD }
u = sqrt(g0) + sqrt(g1) + sqrt(g2);
v = sqrt(g0*g1) + sqrt(g0*g2) + sqrt(g1*g2);
w = sqrt(g0*g1*g2);
den = w*(u*v-w);
f0 = (-w*(u*u+v)+u*v*v)/den;
f1 = (-w-u*u*u+2.*u*v)/den;
f2 = u/den;
id_3 = 1.;
sqrtQinv = f0*id_3 + f1*Q + f2*Q*Q;
PokeIndex<LorentzIndex>(u_proj, V*sqrtQinv, mu);
}
};
// void derivative(const GaugeField& Gauge) const {
// };
};
NAMESPACE_END(Grid);

View File

@ -5,5 +5,4 @@
#include <Grid/qcd/smearing/StoutSmearing.h>
#include <Grid/qcd/smearing/GaugeConfiguration.h>
#include <Grid/qcd/smearing/WilsonFlow.h>
#include <Grid/qcd/smearing/HISQSmearing.h>

View File

@ -1133,13 +1133,4 @@ static_assert(sizeof(SIMD_Ftype) == sizeof(SIMD_Itype), "SIMD vector lengths inc
NAMESPACE_END(Grid);
#ifdef GRID_SYCL
template<> struct sycl::is_device_copyable<Grid::vComplexF> : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vComplexD> : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vRealF > : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vRealD > : public std::true_type {};
template<> struct sycl::is_device_copyable<Grid::vInteger > : public std::true_type {};
#endif
#endif

View File

@ -137,55 +137,5 @@ public:
};
////////////////////////////////////////////////
// Some machinery to streamline making a stencil
////////////////////////////////////////////////
class shiftSignal {
public:
enum {
BACKWARD_CONST = 16,
NO_SHIFT = -1
};
};
// TODO: put a check somewhere that BACKWARD_CONST > Nd!
/*! @brief signals that you want to go backwards in direction dir */
inline int Back(const int dir) {
// generalShift will use BACKWARD_CONST to determine whether we step forward or
// backward. Trick inspired by SIMULATeQCD.
return dir + shiftSignal::BACKWARD_CONST;
}
/*! @brief shift one unit in direction dir */
template<typename... Args>
void generalShift(Coordinate& shift, int dir) {
if (dir >= shiftSignal::BACKWARD_CONST) {
dir -= shiftSignal::BACKWARD_CONST;
shift[dir]+=-1;
} else if (dir == shiftSignal::NO_SHIFT) {
; // do nothing
} else {
shift[dir]+=1;
}
}
/*! @brief follow a path of directions, shifting one unit in each direction */
template<typename... Args>
void generalShift(Coordinate& shift, int dir, Args... args) {
if (dir >= shiftSignal::BACKWARD_CONST) {
dir -= shiftSignal::BACKWARD_CONST;
shift[dir]+=-1;
} else if (dir == shiftSignal::NO_SHIFT) {
; // do nothing
} else {
shift[dir]+=1;
}
generalShift(shift, args...);
}
NAMESPACE_END(Grid);

View File

@ -706,7 +706,7 @@ public:
}
}
}
//std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@ -761,8 +761,7 @@ public:
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances,
Parameters p=Parameters(),
bool preserve_shm=false)
Parameters p=Parameters())
{
face_table_computed=0;
_grid = grid;
@ -856,9 +855,7 @@ public:
/////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd();
// Allow for multiple stencils to exist simultaneously
if (!preserve_shm)
_grid->ShmBufferFreeAll();
_grid->ShmBufferFreeAll();
int maxl=2;
u_simd_send_buf.resize(maxl);

View File

@ -404,12 +404,3 @@ NAMESPACE_BEGIN(Grid);
};
NAMESPACE_END(Grid);
#ifdef GRID_SYCL
template<typename T> struct
sycl::is_device_copyable<T, typename std::enable_if<
Grid::isGridTensor<T>::value && (!std::is_trivially_copyable<T>::value),
void>::type>
: public std::true_type {};
#endif

View File

@ -255,13 +255,17 @@ inline int acceleratorIsCommunicable(void *ptr)
#define GRID_SYCL_LEVEL_ZERO_IPC
NAMESPACE_END(Grid);
// Force deterministic reductions
#define SYCL_REDUCTION_DETERMINISTIC
#if 0
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <CL/sycl/backend/level_zero.hpp>
#else
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <sycl/ext/oneapi/backend/level_zero.hpp>
#endif
NAMESPACE_BEGIN(Grid);
@ -285,24 +289,23 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
unsigned long nt=acceleratorThreads(); \
if(nt < 8)nt=8; \
unsigned long unum1 = num1; \
unsigned long unum2 = num2; \
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
cl::sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
auto lane = item.get_global_id(2); \
{ if (iter1 < unum1){ __VA_ARGS__ } }; \
}); \
});
unsigned long nt=acceleratorThreads(); \
unsigned long unum1 = num1; \
unsigned long unum2 = num2; \
if(nt < 8)nt=8; \
cl::sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
cgh.parallel_for( \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
auto lane = item.get_global_id(2); \
{ __VA_ARGS__ }; \
}); \
});
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

View File

@ -77,10 +77,6 @@ feenableexcept (unsigned int excepts)
}
#endif
#ifndef HOST_NAME_MAX
#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
@ -397,9 +393,6 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
/////////////////////////////////////////////////////////
// Reporting

View File

@ -1,968 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_usqcd.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
using namespace Grid;
std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;
double mflop_ref;
double mflop_ref_err;
int NN_global;
FILE * FP;
struct time_statistics{
double mean;
double err;
double min;
double max;
void statistics(std::vector<double> v){
double sum = std::accumulate(v.begin(), v.end(), 0.0);
mean = sum / v.size();
std::vector<double> diff(v.size());
std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
auto result = std::minmax_element(v.begin(), v.end());
min = *result.first;
max = *result.second;
}
};
void comms_header(){
std::cout <<GridLogMessage << " L "<<"\t"<<" Ls "<<"\t"
<<"bytes\t MB/s uni \t\t MB/s bidi "<<std::endl;
};
struct controls {
int Opt;
int CommsOverlap;
Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
};
class Benchmark {
public:
static void Decomposition (void ) {
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvReal : "<<sizeof(vReal )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplex : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
static void Comms(void)
{
int Nloop=200;
int nmu=0;
int maxlat=32;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
std::vector<double> t_time(Nloop);
time_statistics timestat;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
comms_header();
fprintf(FP,"Communications\n\n");
fprintf(FP,"Packet bytes, direction, GB/s per node\n");
for(int lat=16;lat<=maxlat;lat+=8){
// for(int Ls=8;Ls<=8;Ls*=2){
{ int Ls=12;
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
//Grid.ShmBufferFreeAll();
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
// bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
}
// int ncomm;
double dbytes;
for(int dir=0;dir<8;dir++) {
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
std::vector<double> times(Nloop);
for(int i=0;i<Nloop;i++){
dbytes=0;
double start=usecond();
int xmit_to_rank;
int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank,
bytes);
dbytes+=bytes;
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
<< bytes << " \t "
<<xbytes/timestat.mean
<< "\t\t"
<< bidibytes/timestat.mean<< std::endl;
fprintf(FP,"%ld, %d, %f\n",(long)bytes,dir,bidibytes/timestat.mean/1000.);
}
}
for(int d=0;d<8;d++){
acceleratorFreeDevice(xbuf[d]);
acceleratorFreeDevice(rbuf[d]);
}
}
}
fprintf(FP,"\n\n");
return;
}
static void Memory(void)
{
const int Nvec=8;
typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
typedef iVector<vReal,Nvec> Vec;
Coordinate simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
fprintf(FP,"Memory Bandwidth\n\n");
fprintf(FP,"Bytes, GB/s per node\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
// uint64_t NP;
uint64_t NN;
uint64_t lmax=40;
#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
// NP= Grid.RankCount();
NN =Grid.NodeCount();
Vec rn ; random(sRNG,rn);
LatticeVec z(&Grid); z=Zero();
LatticeVec x(&Grid); x=Zero();
LatticeVec y(&Grid); y=Zero();
double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=a*x-y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nvec*2;// mul,add
double bytes=3.0*vol*Nvec*sizeof(Real);
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
<< "\t\t"<< bytes/time/NN <<std::endl;
fprintf(FP,"%ld, %f\n",(long)bytes,bytes/time/NN);
}
fprintf(FP,"\n\n");
};
static void BLAS(void)
{
//int nbasis, int nrhs, int coarseVol
int basis[] = { 16,32,64 };
int rhs[] = { 8,16,32 };
int vol = 4*4*4*4;
GridBLAS blas;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= batched GEMM (double precision) "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (coarse mrhs)"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
fprintf(FP,"GEMM\n\n M, N, K, BATCH, GF/s per rank\n");
for(int b=0;b<3;b++){
for(int r=0;r<3;r++){
int M=basis[b];
int N=rhs[r];
int K=basis[b];
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
std::cout<<GridLogMessage<<std::setprecision(3)
<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}}
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block project)"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int b=0;b<3;b++){
for(int r=0;r<3;r++){
int M=basis[b];
int N=rhs[r];
int K=vol;
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
std::cout<<GridLogMessage<<std::setprecision(3)
<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}}
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
std::cout<<GridLogMessage << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (block promote)"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int b=0;b<3;b++){
for(int r=0;r<3;r++){
int M=rhs[r];
int N=vol;
int K=basis[b];
int BATCH=vol;
double p=blas.benchmark(M,N,K,BATCH);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, BATCH, p);
std::cout<<GridLogMessage<<std::setprecision(3)
<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}}
fprintf(FP,"\n\n\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
};
static void SU4(void)
{
const int Nc4=4;
typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4;
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking z = y*x SU(4) bandwidth"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t NN;
uint64_t lmax=32;
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
for(int lat=8;lat<=lmax;lat+=8){
Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
NN =Grid.NodeCount();
LatticeSU4 z(&Grid); z=Zero();
LatticeSU4 x(&Grid); x=Zero();
LatticeSU4 y(&Grid); y=Zero();
// double a=2.0;
uint64_t Nloop=NLOOP;
double start=usecond();
for(int i=0;i<Nloop;i++){
z=x*y;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000;
double flops=vol*Nc4*Nc4*(6+(Nc4-1)*8);// mul,add
double bytes=3.0*vol*Nc4*Nc4*2*sizeof(RealF);
std::cout<<GridLogMessage<<std::setprecision(3)
<< lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
<< "\t\t"<< bytes/time/NN <<std::endl;
}
};
static double DWF(int Ls,int L)
{
RealD mass=0.1;
RealD M5 =1.8;
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Nc : "<<Nc<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* Ls : "<<Ls<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
typedef DomainWallFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
///////// Source preparation ////////////
Gauge Umu(UGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
Fermion src (FGrid); random(RNG5,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
Action Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
#ifdef AVX512
const int num_cases = 3;
#else
const int num_cases = 2;
#endif
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ WilsonKernelsStatic::OptHandUnroll, WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ WilsonKernelsStatic::OptInlineAsm , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
};
for(int c=0;c<num_cases;c++) {
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using ASM WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using UNROLLED WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential Comms/Compute" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
// Nc=3 gives
// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
// double flops=(1344.0*volume)/2;
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2;
double flops=(fps*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage<< "Deo FlopsPerSite is "<<fps<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
}
return mflops_best;
}
static double Staggered(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark ImprovedStaggered on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass=0.1;
RealD c1=9.0/8.0;
RealD c2=-1.0/24.0;
RealD u0=1.0;
typedef ImprovedStaggeredFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
typename Action::ImplParams params;
Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params);
///////// Source preparation ////////////
Fermion src (FGrid); random(RNG4,src);
Fermion src_e (FrbGrid);
Fermion src_o (FrbGrid);
Fermion r_e (FrbGrid);
Fermion r_o (FrbGrid);
Fermion r_eo (FGrid);
{
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
const int num_cases = 2;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ StaggeredKernelsStatic::OptGeneric , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptHandUnroll, StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
{ StaggeredKernelsStatic::OptInlineAsm , StaggeredKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent }
};
for(int c=0;c<num_cases;c++) {
StaggeredKernelsStatic::Comms = Cases[c].CommsOverlap;
StaggeredKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
if ( StaggeredKernelsStatic::Opt == StaggeredKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc StaggeredKernels" <<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1146.0*volume)/2;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
return mflops_best;
}
static double Clover(int L)
{
double mflops;
double mflops_best = 0;
double mflops_worst= 0;
std::vector<double> mflops_all;
///////////////////////////////////////////////////////
// Set/Get the layout & grid size
///////////////////////////////////////////////////////
int threads = GridThread::GetThreads();
Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4);
Coordinate local({L,L,L,L});
Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]});
GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4,
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
uint64_t NP = TmpGrid->RankCount();
uint64_t NN = TmpGrid->NodeCount();
NN_global=NN;
uint64_t SHM=NP/NN;
///////// Welcome message ////////////
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "Benchmark Clover on "<<L<<"^4 local volume "<<std::endl;
std::cout<<GridLogMessage << "* Global volume : "<<GridCmdVectorIntToString(latt4)<<std::endl;
std::cout<<GridLogMessage << "* ranks : "<<NP <<std::endl;
std::cout<<GridLogMessage << "* nodes : "<<NN <<std::endl;
std::cout<<GridLogMessage << "* ranks/node : "<<SHM <<std::endl;
std::cout<<GridLogMessage << "* ranks geom : "<<GridCmdVectorIntToString(mpi)<<std::endl;
std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
///////// Lattice Init ////////////
GridCartesian * FGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(FGrid);
///////// RNG Init ////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(FGrid); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
RealD mass=0.1;
RealD csw=1.0;
typedef WilsonCloverFermionF Action;
typedef typename Action::FermionField Fermion;
typedef LatticeGaugeFieldF Gauge;
Gauge Umu(FGrid); SU<Nc>::HotConfiguration(RNG4,Umu);
Action Dc(Umu,*FGrid,*FrbGrid,mass,csw,csw);
///////// Source preparation ////////////
Fermion src (FGrid); random(RNG4,src);
Fermion r (FGrid);
{
const int num_cases = 1;
std::string fmt("G/S/C ; G/O/C ; G/S/S ; G/O/S ");
controls Cases [] = {
{ WilsonKernelsStatic::OptGeneric , WilsonKernelsStatic::CommsAndCompute ,CartesianCommunicator::CommunicatorPolicyConcurrent },
};
for(int c=0;c<num_cases;c++) {
WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
WilsonKernelsStatic::Opt = Cases[c].Opt;
CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int nwarm = 10;
double t0=usecond();
FGrid->Barrier();
for(int i=0;i<nwarm;i++){
Dc.M(src,r);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dc.M(src,r);
t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
double volume=1; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344+ 24+6*6*8*2)*volume;
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Best mflop/s = "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage << L<<"^4 Deo Worst mflop/s = "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
std::cout<<GridLogMessage <<fmt << std::endl;
std::cout<<GridLogMessage ;
for(int i=0;i<mflops_all.size();i++){
std::cout<<mflops_all[i]/NN<<" ; " ;
}
std::cout<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
return mflops_best;
}
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
if (GlobalSharedMemory::WorldRank==0) {
FP = fopen("Benchmark_usqcd.csv","w");
} else {
FP = fopen("/dev/null","w");
}
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
Benchmark::Decomposition();
int do_su4=0;
int do_memory=1;
int do_comms =1;
int do_blas =1;
int sel=4;
std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1;
std::vector<double> clover;
std::vector<double> dwf4;
std::vector<double> staggered;
int Ls=1;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Clover dslash 4D vectorised (temporarily Wilson)" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
clover.push_back(Benchmark::DWF(1,L_list[l]));
}
Ls=12;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::DWF(Ls,L_list[l]) ;
dwf4.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Improved Staggered dslash 4D vectorised" <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
for(int l=0;l<L_list.size();l++){
double result = Benchmark::Staggered(L_list[l]) ;
staggered.push_back(result);
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "L \t\t Clover \t\t DWF4 \t\t Staggered" <<std::endl;
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]<<" \t\t "<<dwf4[l] << " \t\t "<< staggered[l]<<std::endl;
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
int NN=NN_global;
if ( do_memory ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Memory();
}
if ( do_blas ) {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Batched BLAS benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::BLAS();
#endif
}
if ( do_su4 ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::SU4();
}
if ( do_comms ) {
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Benchmark::Comms();
}
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L \t\t Clover\t\t DWF4\t\t Staggered (GF/s per node)" <<std::endl;
fprintf(FP,"Per node summary table\n");
fprintf(FP,"\n");
fprintf(FP,"L , Wilson, DWF4, Staggered, GF/s per node\n");
fprintf(FP,"\n");
for(int l=0;l<L_list.size();l++){
std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< clover[l]/NN<<" \t "<<dwf4[l]/NN<< " \t "<<staggered[l]/NN<<std::endl;
fprintf(FP,"%d , %.0f, %.0f, %.0f\n",L_list[l],clover[l]/NN/1000.,dwf4[l]/NN/1000.,staggered[l]/NN/1000.);
}
fprintf(FP,"\n");
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
std::cout<<GridLogMessage << " Comparison point result: " << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
std::cout<<std::setprecision(3);
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
Grid_finalize();
fclose(FP);
}

View File

@ -1,12 +1,12 @@
#!/usr/bin/env bash
set -e
EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.bz2'
EIGEN_SHA256SUM='b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626'
EIGEN_URL='https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.bz2'
EIGEN_SHA256SUM='685adf14bd8e9c015b78097c1dc22f2f01343756f196acdc76a678e1ae352e11'
echo "-- deploying Eigen source..."
ARC=$(basename ${EIGEN_URL})
ARC=`basename ${EIGEN_URL}`
wget ${EIGEN_URL} --no-check-certificate
if command -v sha256sum; then
echo "$EIGEN_SHA256SUM $(basename "$EIGEN_URL")" \
@ -14,8 +14,13 @@ if command -v sha256sum; then
else
echo "WARNING: could not verify checksum, please install sha256sum" >&2
fi
./scripts/update_eigen.sh "${ARC}"
rm "${ARC}"
./scripts/update_eigen.sh ${ARC}
rm ${ARC}
# patch for non-portable includes in Eigen 3.3.5
# apparently already fixed in Eigen HEAD so it should not be
# a problem in the future (A.P.)
patch Eigen/unsupported/Eigen/CXX11/Tensor scripts/eigen-3.3.5.Tensor.patch
echo '-- generating Make.inc files...'
./scripts/filelist
echo '-- generating configure script...'

View File

@ -1,183 +0,0 @@
/*
* Example_plaquette.cc
*
* D. Clarke
*
* Here I just want to create an incredibly simple main to get started with GRID and get used
* to its syntax. If the reader is like me, they vaguely understand something about lattice coding,
* they don't know a ton of C++, don't know much of the fine details, and certainly know nothing about GRID.
*
* Once you've made a new executable, like this one, you can bootstrap.sh again. At this point,
* the code should be able to find your new executable. You can tell that bootstrap.sh worked by
* having a look at Make.inc. You should see your executable inside there.
*
* Warning: This code illustrative only, not well tested, and not meant for production use. The best
* way to read this code is to start at the main.
*
*/
// All your mains should have this
#include <Grid/Grid.h>
using namespace Grid;
// This copies what already exists in WilsonLoops.h. The point here is to be pedagogical and explain in
// detail what everything does so we can see how GRID works.
template <class Gimpl> class WLoops : public Gimpl {
public:
// Gimpl seems to be an arbitrary class. Within this class, it is expected that certain types are
// already defined, things like Scalar and Field. This macro includes a bunch of #typedefs that
// implement this equivalence at compile time.
INHERIT_GIMPL_TYPES(Gimpl);
// Some example Gimpls can be found in GaugeImplementations.h, at the bottom. These are in turn built
// out of GaugeImplTypes, which can be found in GaugeImplTypes.h. The GaugeImplTypes contain the base
// field/vector/link/whatever types. These inherit from iScalar, iVector, and iMatrix objects, which
// are sort of the building blocks for gerenal math objects. The "i" at the beginning of these names
// indicates that they should be for internal use only. It seems like these base types have the
// acceleration, e.g. SIMD or GPU or what-have-you, abstracted away. How you accelerate these things
// appears to be controlled through a template parameter called vtype.
// The general math/physics objects, such as a color matrix, are built up by nesting these objects.
// For instance a general color matrix has two color indices, so it's built up like
// iScalar<iScalar<iMatrix<vtype ...
// where the levels going from the inside out are color, spin, then Lorentz indices. Scalars have
// no indices, so it's what we use when such an index isn't needed. Lattice objects are made by one
// higher level of indexing using iVector.
// These types will be used for U and U_mu objects, respectively.
typedef typename Gimpl::GaugeLinkField GaugeMat;
typedef typename Gimpl::GaugeField GaugeLorentz;
// U_mu_nu(x)
static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) {
// Calls like CovShiftForward and CovShiftBackward have 3 arguments, and they multiply together
// the first and last argument. (Second arg gives the shift direction.) The CovShiftIdentityBackward
// has meanwhile only two arguments; it just returns the shifted (adjoint since backward) link.
plaq = Gimpl::CovShiftForward(U[mu],mu,
// Means Link*Cshift(field,mu,1), arguments are Link, mu, field in that order.
Gimpl::CovShiftForward(U[nu],nu,
Gimpl::CovShiftBackward(U[mu],mu,
// This means Cshift(adj(Link), mu, -1)
Gimpl::CovShiftIdentityBackward(U[nu], nu))));
}
// tr U_mu_nu(x)
static void traceDirPlaquette(ComplexField &plaq, const std::vector<GaugeMat> &U, const int mu, const int nu) {
// This .Grid() syntax seems to get the pointer to the GridBase. Apparently this is needed as argument
// to instantiate a Lattice object.
GaugeMat sp(U[0].Grid());
dirPlaquette(sp, U, mu, nu);
plaq = trace(sp);
}
// sum_mu_nu tr U_mu_nu(x)
static void sitePlaquette(ComplexField &Plaq, const std::vector<GaugeMat> &U) {
ComplexField sitePlaq(U[0].Grid());
Plaq = Zero();
// Nd=4 and Nc=3 are set as global constants in QCD.h
for (int mu = 1; mu < Nd; mu++) {
for (int nu = 0; nu < mu; nu++) {
traceDirPlaquette(sitePlaq, U, mu, nu);
Plaq = Plaq + sitePlaq;
}
}
}
// sum_mu_nu_x Re tr U_mu_nu(x)
static RealD sumPlaquette(const GaugeLorentz &Umu) {
std::vector<GaugeMat> U(Nd, Umu.Grid());
for (int mu = 0; mu < Nd; mu++) {
// Umu is a GaugeLorentz object, and as such has a non-trivial Lorentz index. We can
// access the element in the mu Lorentz index with this PeekIndex syntax.
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
ComplexField Plaq(Umu.Grid());
sitePlaquette(Plaq, U);
// I guess this should be the line that sums over all space-time sites.
auto Tp = sum(Plaq);
// Until now, we have been working with objects inside the tensor nest. This TensorRemove gets
// rid of the tensor nest to return whatever is inside.
auto p = TensorRemove(Tp);
return p.real();
}
// < Re tr U_mu_nu(x) >
static RealD avgPlaquette(const GaugeLorentz &Umu) {
// Real double type
RealD sumplaq = sumPlaquette(Umu);
// gSites() is the number of global sites. there is also lSites() for local sites.
double vol = Umu.Grid()->gSites();
// The number of orientations. 4*3/2=6 for Nd=4, as known.
double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
return sumplaq / vol / faces / Nc;
}
};
// Next we show an example of how to construct an input parameter class. We first inherit
// from Serializable. Then all class data members have to be defined using the
// GRID_SERIALIZABLE_CLASS_MEMBERS macro. This variadic macro allows for arbitrarily many
// class data members. In the below case, we make a parameter file holding the configuration
// name. Here, it expects the name to be labeled with "conf_name" in the configuration file.
struct ConfParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(
ConfParameters,
std::string, conf_name);
template <class ReaderClass>
ConfParameters(Reader<ReaderClass>& Reader){
// If we are reading an XML file, it should be structured like:
// <grid>
// <parameters>
// <conf_name>l20t20b06498a_nersc.302500</conf_name>
// </parameters>
// </grid>
read(Reader, "parameters", *this);
}
};
// This syntax lets you pass command line arguments to main. An asterisk means that what follows is
// a pointer. Two asterisks means what follows is a pointer to an array.
int main (int argc, char **argv)
{
// This initializes Grid. Some command line options include
// --mpi n.n.n.n
// --threads n
// --grid n.n.n.n
Grid_init(&argc, &argv);
// This is where you would specify a custom lattice size, if not from the command line. Here
// Nd is a global quantity that is currently set to 4.
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
Coordinate latt_size = GridDefaultLatt();
// Instantiate the spacetime Grid on which everything will be built.
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
// The PeriodicGimplD type is what you want for gauge matrices. There is also a LatticeGaugeFieldD
// type that you can use, which will work perfectly with what follows.
PeriodicGimplD::Field U(&GRID);
// Here we read in the parameter file params.json to get conf_name. The last argument is what the
// top organizational level is called in the param file.
XmlReader Reader("Example_plaquette.xml",false, "grid");
ConfParameters param(Reader);
// Load a lattice from SIMULATeQCD into U. SIMULATeQCD finds plaquette = 0.6381995717
FieldMetaData header;
NerscIO::readConfiguration(U, header, param.conf_name);
// Let's see what we find.
RealD plaq = WLoops<PeriodicGimplD>::avgPlaquette(U);
// This is how you make log messages.
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1) << "Plaquette = " << plaq << std::endl;
// To wrap things up.
Grid_finalize();
}

View File

@ -0,0 +1,19 @@
--- ./Eigen/unsupported/Eigen/CXX11/Tensor 2018-07-23 10:33:42.000000000 +0100
+++ Tensor 2018-08-28 16:15:56.000000000 +0100
@@ -25,7 +25,7 @@
#include <utility>
#endif
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
#include "../SpecialFunctions"
#include "src/util/CXX11Meta.h"
@@ -147,6 +147,6 @@
#include "src/Tensor/TensorIO.h"
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
//#endif // EIGEN_CXX11_TENSOR_MODULE

View File

@ -25,16 +25,12 @@ export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
export FI_CXI_CQ_FILL_PERCENT=10
export FI_CXI_DEFAULT_CQ_SIZE=262144
#export FI_CXI_DEFAULT_CQ_SIZE=131072
#export FI_CXI_CQ_FILL_PERCENT=20
# 12 ppn, 32 nodes, 384 ranks
#
@ -49,12 +45,12 @@ CMD="mpiexec -np 12288 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 8.8.8.24 --grid 128.128.128.384 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1024node.dwf.small.cq
$CMD | tee 1024node.dwf.small
CMD="mpiexec -np 12288 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 16.8.8.12 --grid 256.256.256.384 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
$CMD | tee 1024node.dwf.cq
$CMD | tee 1024node.dwf

View File

@ -17,7 +17,6 @@ source ../sourceme.sh
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
@ -36,25 +35,11 @@ CMD="mpiexec -np 24 -ppn 12 -envall \
./Benchmark_comms_host_device --mpi 2.3.2.2 --grid 32.24.32.192 \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
#$CMD
$CMD
CMD="mpiexec -np 24 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 2.3.2.2 --grid 64.96.64.64 --comms-overlap \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
#$CMD
CMD="mpiexec -np 1 -ppn 1 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD
CMD="mpiexec -np 1 -ppn 1 -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 --comms-sequential \
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32"
$CMD

View File

@ -11,6 +11,6 @@ TOOLS=$HOME/tools
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/ -L${MKLROOT}/lib -qmkl=parallel " \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include -qmkl=parallel"
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$TOOLS/lib64/" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"

View File

@ -3,24 +3,10 @@
module use /soft/modulefiles
module load intel_compute_runtime/release/agama-devel-682.22
export FI_CXI_DEFAULT_CQ_SIZE=131072
export FI_CXI_CQ_FILL_PERCENT=20
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
#
# -ftarget-register-alloc-mode=pvc:default
# -ftarget-register-alloc-mode=pvc:small
# -ftarget-register-alloc-mode=pvc:large
# -ftarget-register-alloc-mode=pvc:auto
#
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
git config --global http.proxy http://proxy.alcf.anl.gov:3128
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

View File

@ -1,40 +0,0 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=16
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 16 nodes, 192 ranks
CMD="mpiexec -np 192 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Test_dwf_mixedcg_prec --mpi 2.4.4.6 --grid 64.128.128.192 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
$CMD

View File

@ -1,40 +0,0 @@
#!/bin/bash
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
#PBS -q EarlyAppAccess
#PBS -l select=16
#PBS -l walltime=01:00:00
#PBS -A LatticeQCD_aesp_CNDA
#export OMP_PROC_BIND=spread
#unset OMP_PLACES
cd $PBS_O_WORKDIR
source ../../sourceme.sh
cat $PBS_NODEFILE
export OMP_NUM_THREADS=3
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
export MPICH_OFI_NIC_POLICY=GPU
# 12 ppn, 16 nodes, 192 ranks
CMD="mpiexec -np 192 -ppn 12 -envall \
./gpu_tile_compact.sh \
./Test_staggered_cg_prec --mpi 2.4.4.6 --grid 128.128.128.192 \
--shm-mpi 1 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 3000"
$CMD

View File

@ -1,70 +0,0 @@
Memory Bandwidth
Bytes, GB/s per node
3145728, 225.900365
50331648, 2858.859504
254803968, 4145.556367
805306368, 4905.772480
1966080000, 4978.312557
GEMM
M, N, K, BATCH, GF/s per rank
16, 8, 16, 256, 1.713639
16, 16, 16, 256, 288.268316
16, 32, 16, 256, 597.053950
32, 8, 32, 256, 557.382591
32, 16, 32, 256, 1100.145311
32, 32, 32, 256, 1885.080449
64, 8, 64, 256, 1725.163599
64, 16, 64, 256, 3389.336566
64, 32, 64, 256, 4168.252422
16, 8, 256, 256, 1326.262134
16, 16, 256, 256, 2318.095475
16, 32, 256, 256, 3555.436503
32, 8, 256, 256, 1920.139170
32, 16, 256, 256, 3486.174753
32, 32, 256, 256, 5320.821724
64, 8, 256, 256, 2539.597502
64, 16, 256, 256, 5003.456775
64, 32, 256, 256, 7837.531562
8, 256, 16, 256, 1427.848170
16, 256, 16, 256, 2222.147815
32, 256, 16, 256, 2877.121715
8, 256, 32, 256, 1922.890086
16, 256, 32, 256, 3199.469082
32, 256, 32, 256, 4845.405343
8, 256, 64, 256, 2639.483343
16, 256, 64, 256, 5012.800299
32, 256, 64, 256, 7216.006882
Communications
Packet bytes, direction, GB/s per node
4718592, 2, 206.570734
4718592, 3, 207.501847
4718592, 6, 189.730277
4718592, 7, 204.301218
15925248, 2, 307.882997
15925248, 3, 287.901076
15925248, 6, 295.603109
15925248, 7, 300.682033
37748736, 2, 331.740364
37748736, 3, 338.610627
37748736, 6, 332.580657
37748736, 7, 336.336579
Per node summary table
L , Wilson, DWF4, Staggered, GF/s per node
8 , 16, 1165, 10
12 , 473, 4901, 163
16 , 1436, 8464, 442
24 , 4133, 10139, 1530
32 , 5726, 11487, 2518
1 Memory Bandwidth
2 Bytes, GB/s per node
3 3145728, 225.900365
4 50331648, 2858.859504
5 254803968, 4145.556367
6 805306368, 4905.772480
7 1966080000, 4978.312557
8 GEMM
9 M, N, K, BATCH, GF/s per rank
10 16, 8, 16, 256, 1.713639
11 16, 16, 16, 256, 288.268316
12 16, 32, 16, 256, 597.053950
13 32, 8, 32, 256, 557.382591
14 32, 16, 32, 256, 1100.145311
15 32, 32, 32, 256, 1885.080449
16 64, 8, 64, 256, 1725.163599
17 64, 16, 64, 256, 3389.336566
18 64, 32, 64, 256, 4168.252422
19 16, 8, 256, 256, 1326.262134
20 16, 16, 256, 256, 2318.095475
21 16, 32, 256, 256, 3555.436503
22 32, 8, 256, 256, 1920.139170
23 32, 16, 256, 256, 3486.174753
24 32, 32, 256, 256, 5320.821724
25 64, 8, 256, 256, 2539.597502
26 64, 16, 256, 256, 5003.456775
27 64, 32, 256, 256, 7837.531562
28 8, 256, 16, 256, 1427.848170
29 16, 256, 16, 256, 2222.147815
30 32, 256, 16, 256, 2877.121715
31 8, 256, 32, 256, 1922.890086
32 16, 256, 32, 256, 3199.469082
33 32, 256, 32, 256, 4845.405343
34 8, 256, 64, 256, 2639.483343
35 16, 256, 64, 256, 5012.800299
36 32, 256, 64, 256, 7216.006882
37 Communications
38 Packet bytes, direction, GB/s per node
39 4718592, 2, 206.570734
40 4718592, 3, 207.501847
41 4718592, 6, 189.730277
42 4718592, 7, 204.301218
43 15925248, 2, 307.882997
44 15925248, 3, 287.901076
45 15925248, 6, 295.603109
46 15925248, 7, 300.682033
47 37748736, 2, 331.740364
48 37748736, 3, 338.610627
49 37748736, 6, 332.580657
50 37748736, 7, 336.336579
51 Per node summary table
52 L , Wilson, DWF4, Staggered, GF/s per node
53 8 , 16, 1165, 10
54 12 , 473, 4901, 163
55 16 , 1436, 8464, 442
56 24 , 4133, 10139, 1530
57 32 , 5726, 11487, 2518

View File

@ -5,12 +5,10 @@ LIME=/p/home/jusers/boyle2/juwels/gm2dwf/boyle/
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$LIME \
--enable-accelerator-cshift \
--disable-accelerator-cshift \
--disable-unified \
CXX=nvcc \
LDFLAGS="-cudart shared " \
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared -lcublas"
CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++14 -cudart shared"

View File

@ -1,5 +1,5 @@
module load GCC
module load GMP
module load MPFR
module load OpenMPI
module load CUDA
module load GCC/9.3.0
module load GMP/6.2.0
module load MPFR/4.1.0
module load OpenMPI/4.1.0rc1
module load CUDA/11.3

View File

@ -16,7 +16,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -fgpu-sanitize" \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 "

View File

@ -1,5 +1,3 @@
export https_proxy=http://proxy-chain.intel.com:911
module load intel-release
module load intel/mpich
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

View File

@ -1,3 +1,4 @@
CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=c++-13 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
BREW=/opt/local/
MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug

View File

@ -30,20 +30,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
#ifndef HOST_NAME_MAX
#define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
#endif
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1);
std::string host(hostname);
Grid_init(&argc,&argv);
const int Ls=12;
std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
{
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@ -85,14 +92,7 @@ int main (int argc, char ** argv)
SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
int nsecs=600;
if( GridCmdOptionExists(argv,argv+argc,"--seconds") ){
std::string arg = GridCmdOptionPayload(argv,argv+argc,"--seconds");
GridCmdOptionInt(arg,nsecs);
}
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG for "<<nsecs <<" seconds" << std::endl;
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
double t1,t2,flops;
double MdagMsiteflops = 1452; // Mobius (real coeffs)
@ -101,14 +101,7 @@ int main (int argc, char ** argv)
std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
int iters;
time_t start = time(NULL);
uint32_t csum, csumref;
csumref=0;
int iter=0;
do {
std::cerr << "******************* SINGLE PRECISION SOLVE "<<iter<<std::endl;
for(int i=0;i<10;i++){
result_o = Zero();
t1=usecond();
mCG(src_o,result_o);
@ -118,28 +111,10 @@ int main (int argc, char ** argv)
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
csum = crc(result_o);
if ( csumref == 0 ) {
csumref = csum;
} else {
if ( csum != csumref ) {
std::cerr << host<<" FAILURE " <<iter <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
assert(0);
} else {
std::cout << host <<" OK " <<iter <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
}
}
iter ++;
} while (time(NULL) < (start + nsecs/2) );
std::cout << GridLogMessage << "::::::::::::: Starting double precision CG" << std::endl;
}
std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
csumref=0;
int i=0;
do {
std::cerr << "******************* DOUBLE PRECISION SOLVE "<<i<<std::endl;
for(int i=0;i<1;i++){
result_o_2 = Zero();
t1=usecond();
CG(HermOpEO,src_o,result_o_2);
@ -147,30 +122,46 @@ int main (int argc, char ** argv)
iters = CG.IterationsToComplete;
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
csum = crc(result_o);
if ( csumref == 0 ) {
csumref = csum;
} else {
if ( csum != csumref ) {
std::cerr << i <<" csum "<<std::hex<<csum<< " != "<<csumref <<std::dec<<std::endl;
assert(0);
} else {
std::cout << i <<" csum "<<std::hex<<csum<<std::dec<<" -- OK! "<<std::endl;
}
}
i++;
} while (time(NULL) < (start + nsecs) );
}
// MemoryManager::Print();
LatticeFermionD diff_o(FrbGrid);
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
assert(diff < 1e-4);
#ifdef HAVE_LIME
if( GridCmdOptionExists(argv,argv+argc,"--checksums") ){
std::string file1("./Propagator1");
emptyUserRecord record;
uint32_t nersc_csum;
uint32_t scidac_csuma;
uint32_t scidac_csumb;
typedef SpinColourVectorD FermionD;
typedef vSpinColourVectorD vFermionD;
BinarySimpleMunger<FermionD,FermionD> munge;
std::string format = getFormatString<vFermionD>();
BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o,file1,munge, 0, format,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << " Mixed checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
BinaryIO::writeLatticeObject<vFermionD,FermionD>(result_o_2,file1,munge, 0, format,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
}
#endif
}
MemoryManager::Print();
Grid_finalize();
}

View File

@ -257,65 +257,7 @@ int main (int argc, char ** argv) {
}
traceStop(trace_id);
LatticeSpinColourMatrixD test_data_scm(&Grid);
gaussian(pRNG,test_data_scm);
std::vector<SpinColourMatrixD> reduction_reference_scm;
std::vector<SpinColourMatrixD> reduction_result_scm;
//warmup
for (int sweeps = 0; sweeps < 5; sweeps++) {
reduction_result_scm = sliceSum(test_data_scm,0);
}
trace_id = traceStart("sliceSum benchmark - SpinColourMatrixD");
std::cout << GridLogMessage << "Testing SpinColourMatrixD" << std::endl;
std::cout << GridLogMessage << "sizeof(SpinColourMatrixD) = " << sizeof(SpinColourMatrixD) << std::endl;
std::cout << GridLogMessage << "sizeof(vSpinColourMatrixD) = " << sizeof(vSpinColourMatrixD) << std::endl;
for (int i = 0; i < Nd; i++) {
RealD t=-usecond();
tracePush("sliceSum");
sliceSumCPU(test_data_scm,reduction_reference_scm,i);
tracePop("sliceSum");
t+=usecond();
std::cout << GridLogMessage << "Orthog. dir. = " << i << std::endl;
std::cout << GridLogMessage << "CPU sliceSum took "<<t<<" usecs"<<std::endl;
RealD tgpu=-usecond();
tracePush("sliceSumGpu");
reduction_result_scm = sliceSum(test_data_scm,i);
tracePop("sliceSumGpu");
tgpu+=usecond();
std::cout << GridLogMessage <<"GPU sliceSum took "<<tgpu<<" usecs"<<std::endl<<std::endl;;
for(int t=0;t<reduction_reference_scm.size();t++) {
auto diff = reduction_reference_scm[t]-reduction_result_scm[t];
// std::cout << diff <<std::endl;
for (int is = 0; is < Ns; is++) {
for (int js = 0; js < Ns; js++) {
for (int ic = 0; ic < Nc; ic++) {
for (int jc = 0; jc < Nc; jc++) {
assert(abs(diff()(is,js)(ic,jc)) < 1e-8);
}
}
}
}
}
}
traceStop(trace_id);
Grid_finalize();
return 0;
}

View File

@ -32,7 +32,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
// This is to optimize the SIMD
template<class vobj> void gpermute(vobj & inout,int perm){
vobj tmp=inout;
if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
@ -40,8 +39,7 @@ template<class vobj> void gpermute(vobj & inout,int perm){
if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
@ -49,21 +47,20 @@ int main (int argc, char ** argv)
Coordinate latt_size = GridDefaultLatt();
Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
std::cout << GridLogMessage << " mpi "<<mpi_layout<<std::endl;
std::cout << GridLogMessage << " simd "<<simd_layout<<std::endl;
std::cout << GridLogMessage << " latt "<<latt_size<<std::endl;
std::cout << " mpi "<<mpi_layout<<std::endl;
std::cout << " simd "<<simd_layout<<std::endl;
std::cout << " latt "<<latt_size<<std::endl;
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
// Initialize configuration as hot start.
GridParallelRNG pRNG(&GRID);
LatticeGaugeField Umu(&GRID);
pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeGaugeField Umu(&GRID);
SU<Nc>::HotConfiguration(pRNG,Umu);
Real plaq=WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu);
LatticeComplex trplaq(&GRID);
// Store Umu in U. Peek/Poke mean respectively getElement/setElement.
std::vector<LatticeColourMatrix> U(Nd, Umu.Grid());
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@ -73,7 +70,9 @@ int main (int argc, char ** argv)
LatticeComplex cplaq(&GRID); cplaq=Zero();
/////////////////////////////////////////////////
// Create a padded cell of extra padding depth=1
/////////////////////////////////////////////////
int depth = 1;
PaddedCell Ghost(depth,&GRID);
LatticeGaugeField Ughost = Ghost.Exchange(Umu);
@ -115,25 +114,18 @@ int main (int argc, char ** argv)
}
#endif
// Array for the site plaquette
///// Array for the site plaquette
GridBase *GhostGrid = Ughost.Grid();
LatticeComplex gplaq(GhostGrid);
// Now we're going to put together the "stencil" that will be useful to us when
// calculating the plaquette. Our eventual goal is to make the product
// Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x),
// which requires, in order, the sites x, x+mu, x+nu, and x. We arrive at these
// sites relative to x through "shifts", which is represented here by a 4-d
// vector of 0s (no movement) and 1s (shift one unit) at each site. The
// "stencil" is the set of all these shifts.
std::vector<Coordinate> shifts;
for(int mu=0;mu<Nd;mu++){
for(int nu=mu+1;nu<Nd;nu++){
// Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
Coordinate shift_0(Nd,0);
Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
// push_back creates an element at the end of shifts and
// assigns the data in the argument to it.
shifts.push_back(shift_0);
shifts.push_back(shift_mu);
shifts.push_back(shift_nu);
@ -143,51 +135,41 @@ int main (int argc, char ** argv)
GeneralLocalStencil gStencil(GhostGrid,shifts);
gplaq=Zero();
{
autoView( gp_v , gplaq, CpuWrite);
autoView( t_v , trplaq, CpuRead);
autoView( U_v , Ughost, CpuRead);
for(int ss=0;ss<gp_v.size();ss++){
int s=0;
for(int mu=0;mu<Nd;mu++){
for(int nu=mu+1;nu<Nd;nu++){
// Before doing accelerator stuff, there is an opening and closing of "Views". I guess the
// "Views" are stored in *_v variables listed below.
autoView( gp_v , gplaq, CpuWrite);
autoView( t_v , trplaq, CpuRead);
autoView( U_v , Ughost, CpuRead);
auto SE0 = gStencil.GetEntry(s+0,ss);
auto SE1 = gStencil.GetEntry(s+1,ss);
auto SE2 = gStencil.GetEntry(s+2,ss);
auto SE3 = gStencil.GetEntry(s+3,ss);
int o0 = SE0->_offset;
int o1 = SE1->_offset;
int o2 = SE2->_offset;
int o3 = SE3->_offset;
auto U0 = U_v[o0](mu);
auto U1 = U_v[o1](nu);
auto U2 = adj(U_v[o2](mu));
auto U3 = adj(U_v[o3](nu));
// This is now a loop over stencil shift elements. That is, s increases as we make our
// way through the spacetimes sites, but also as we make our way around the plaquette.
for(int ss=0;ss<gp_v.size();ss++){
int s=0;
for(int mu=0;mu<Nd;mu++){
for(int nu=mu+1;nu<Nd;nu++){
auto SE0 = gStencil.GetEntry(s+0,ss);
auto SE1 = gStencil.GetEntry(s+1,ss);
auto SE2 = gStencil.GetEntry(s+2,ss);
auto SE3 = gStencil.GetEntry(s+3,ss);
// Due to our strategy, each offset corresponds to a site.
int o0 = SE0->_offset;
int o1 = SE1->_offset;
int o2 = SE2->_offset;
int o3 = SE3->_offset;
auto U0 = U_v[o0](mu);
auto U1 = U_v[o1](nu);
auto U2 = adj(U_v[o2](mu));
auto U3 = adj(U_v[o3](nu));
gpermute(U0,SE0->_permute);
gpermute(U1,SE1->_permute);
gpermute(U2,SE2->_permute);
gpermute(U3,SE3->_permute);
gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 );
s=s+4;
}
gpermute(U0,SE0->_permute);
gpermute(U1,SE1->_permute);
gpermute(U2,SE2->_permute);
gpermute(U3,SE3->_permute);
gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 );
s=s+4;
}
}
}
}
// Here is my understanding of this part: The padded cell has its own periodic BCs, so
// if I take a step to the right at the right-most side of the cell, I end up on the
// left-most side. This means that the plaquettes in the padding are wrong. Luckily
// all we care about are the plaquettes in the cell, which we obtain from Extract.
cplaq = Ghost.Extract(gplaq);
RealD vol = cplaq.Grid()->gSites();
RealD faces = (Nd * (Nd-1))/2;

View File

@ -1,181 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/smearing/Test_fatLinks.cc
Copyright (C) 2023
Author: D. A. Clarke <clarke.davida@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/*
@file Test_fatLinks.cc
@brief test of the HISQ smearing
*/
#include <Grid/Grid.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
#include <Grid/qcd/smearing/HISQSmearing.h>
using namespace Grid;
/*! @brief parameter file to easily adjust Nloop */
struct ConfParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(
ConfParameters,
int, benchmark,
int, Nloop);
template <class ReaderClass>
ConfParameters(Reader<ReaderClass>& Reader){
read(Reader, "parameters", *this);
}
};
bool testSmear(GridCartesian& GRID, LatticeGaugeFieldD Umu, LatticeGaugeFieldD Usmr, LatticeGaugeFieldD Unaik,
LatticeGaugeFieldD Ucontrol, Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp) {
Smear_HISQ<PeriodicGimplD> hisq_fat(&GRID,c1,cnaik,c3,c5,c7,clp);
LatticeGaugeFieldD diff(&GRID), Uproj(&GRID);
hisq_fat.smear(Usmr, Unaik, Umu);
bool result;
if (cnaik < 1e-30) { // Testing anything but Naik term
diff = Ucontrol-Usmr;
auto absDiff = norm2(diff)/norm2(Ucontrol);
if (absDiff < 1e-30) {
Grid_pass(" |Umu-Usmr|/|Umu| = ",absDiff);
result = true;
} else {
Grid_error(" |Umu-Usmr|/|Umu| = ",absDiff);
result = false;
}
} else { // Testing Naik specifically
diff = Ucontrol-Unaik;
auto absDiff = norm2(diff)/norm2(Ucontrol);
if (absDiff < 1e-30) {
Grid_pass(" |Umu-Unaik|/|Umu| = ",absDiff);
result = true;
} else {
Grid_error(" |Umu-Unaik|/|Umu| = ",absDiff);
result = false;
}
hisq_fat.projectU3(Uproj,Ucontrol);
// NerscIO::writeConfiguration(Unaik,"nersc.l8t4b3360.naik");
}
return result;
}
int main (int argc, char** argv) {
// Params for the test.
int Ns = 8;
int Nt = 4;
Coordinate latt_size(Nd,0); latt_size[0]=Ns; latt_size[1]=Ns; latt_size[2]=Ns; latt_size[3]=Nt;
std::string conf_in = "nersc.l8t4b3360";
int threads = GridThread::GetThreads();
typedef LatticeGaugeFieldD LGF;
// Initialize the Grid
Grid_init(&argc,&argv);
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
Coordinate mpi_layout = GridDefaultMpi();
Grid_log("mpi = ",mpi_layout);
Grid_log("simd = ",simd_layout);
Grid_log("latt = ",latt_size);
Grid_log("threads = ",threads);
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
XmlReader Reader("fatParams.xml",false,"grid");
ConfParameters param(Reader);
if(param.benchmark) Grid_log(" Nloop = ",param.Nloop);
LGF Umu(&GRID), Usmr(&GRID), Unaik(&GRID), Ucontrol(&GRID);
// Read the configuration into Umu
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, conf_in);
bool pass=true;
// Carry out various tests
NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.357lplink.control");
pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,1/384.,-1/8.);
NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.357link.control");
pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,1/384.,0.);
NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.35link.control");
pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,1/64.,0.,0.);
NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.3link.control");
pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,1/8.,0.,1/16.,0.,0.,0.);
NerscIO::readConfiguration(Ucontrol, header, "nersc.l8t4b3360.naik.control");
pass *= testSmear(GRID,Umu,Usmr,Unaik,Ucontrol,0.,0.8675309,0.,0.,0.,0.);
if(pass){
Grid_pass("All tests passed.");
} else {
Grid_error("At least one test failed.");
}
// Test a C-style instantiation
double path_coeff[6] = {1, 2, 3, 4, 5, 6};
Smear_HISQ<PeriodicGimplD> hisq_fat_Cstyle(&GRID,path_coeff);
if (param.benchmark) {
autoView(U_v, Umu, CpuRead); // Gauge accessor
// Read in lattice sequentially, Nloop times
double lookupTime = 0.;
for(int i=0;i<param.Nloop;i++) {
double start = usecond();
for(int ss=0;ss<U_v.size();ss++)
for(int mu=0;mu<Nd;mu++) {
auto U1 = U_v[ss](mu);
}
double stop = usecond();
lookupTime += stop-start; // microseconds
}
Grid_log("Time to lookup: ",lookupTime,"[ms]");
// Raise a matrix to the power nmat, for each link.
auto U1 = U_v[0](0);
for(int nmat=1;nmat<8;nmat++) {
double multTime = 0.;
for(int i=0;i<param.Nloop;i++) {
double start=usecond();
for(int ss=0;ss<U_v.size();ss++)
for(int mu=0;mu<Nd;mu++) {
auto U2 = U1;
for(int j=1;j<nmat;j++) {
U2 *= U1;
}
}
double stop=usecond();
multTime += stop-start;
}
Grid_log("Time to multiply ",nmat," matrices: ",multTime," [ms]");
}
}
Grid_finalize();
}