1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 13:57:07 +01:00

Compare commits

..

3 Commits

Author SHA1 Message Date
12d20d8e15 Merge branch 'release/0.10.0' 2023-03-29 16:35:33 -04:00
25777e5967 Merge branch 'release/0.9.0' 2023-03-29 15:27:58 -04:00
deab11e68b Flop cout matches DiRAC-ITT-2020 2020-11-16 17:15:34 +01:00
232 changed files with 2609 additions and 15770 deletions

View File

@ -1,54 +0,0 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]
body:
- type: markdown
attributes:
value: >
Thank you for taking the time to file a bug report.
Please check that the code is pointing to the HEAD of develop
or any commit in master which is tagged with a version number.
- type: textarea
attributes:
label: "Describe the issue:"
description: >
Describe the issue and any previous attempt to solve it.
validations:
required: true
- type: textarea
attributes:
label: "Code example:"
description: >
If relevant, show how to reproduce the issue using a minimal working
example.
placeholder: |
<< your code here >>
render: shell
validations:
required: false
- type: textarea
attributes:
label: "Target platform:"
description: >
Give a description of the target platform (CPU, network, compiler).
Please give the full CPU part description, using for example
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
or `sysctl machdep.cpu.brand_string` (macOS) and the full output
the `--version` option of your compiler.
validations:
required: true
- type: textarea
attributes:
label: "Configure options:"
description: >
Please give the exact configure command used and attach
`config.log`, `grid.config.summary` and the output of `make V=1`.
render: shell
validations:
required: true

4
.gitignore vendored
View File

@ -1,7 +1,3 @@
# Doxygen stuff
html/*
latex/*
# Compiled Object files #
#########################
*.slo

View File

@ -34,7 +34,7 @@
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#define EIGEN_DONT_VECTORIZE
#undef EIGEN_USE_SYCL
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif

View File

@ -66,10 +66,6 @@ if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#if defined(USE_MKL) || defined(GRID_SYCL)
#ifdef USE_MKL
#include <fftw/fftw3.h>
#else
#include <fftw3.h>

View File

@ -542,7 +542,6 @@ public:
(*this)(in[i], out[i]);
}
}
virtual ~LinearFunction(){};
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {

View File

@ -293,7 +293,7 @@ static void sncndnFK(INTERNAL_PRECISION u, INTERNAL_PRECISION k,
* Set type = 0 for the Zolotarev approximation, which is zero at x = 0, and
* type = 1 for the approximation which is infinite at x = 0. */
zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
zolotarev_data* zolotarev(PRECISION epsilon, int n, int type) {
INTERNAL_PRECISION A, c, cp, kp, ksq, sn, cn, dn, Kp, Kj, z, z0, t, M, F,
l, invlambda, xi, xisq, *tv, s, opl;
int m, czero, ts;
@ -375,12 +375,12 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
construct_partfrac(d);
construct_contfrac(d);
/* Converting everything to ZOLO_PRECISION for external use only */
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -390,24 +390,24 @@ zolotarev_data* zolotarev(ZOLO_PRECISION epsilon, int n, int type) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -426,7 +426,7 @@ void zolotarev_free(zolotarev_data *zdata)
}
zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
zolotarev_data* higham(PRECISION epsilon, int n) {
INTERNAL_PRECISION A, M, c, cp, z, z0, t, epssq;
int m, czero;
zolotarev_data *zd;
@ -481,9 +481,9 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
/* Converting everything to PRECISION for external use only */
zd = (zolotarev_data*) malloc(sizeof(zolotarev_data));
zd -> A = (ZOLO_PRECISION) d -> A;
zd -> Delta = (ZOLO_PRECISION) d -> Delta;
zd -> epsilon = (ZOLO_PRECISION) d -> epsilon;
zd -> A = (PRECISION) d -> A;
zd -> Delta = (PRECISION) d -> Delta;
zd -> epsilon = (PRECISION) d -> epsilon;
zd -> n = d -> n;
zd -> type = d -> type;
zd -> dn = d -> dn;
@ -493,24 +493,24 @@ zolotarev_data* higham(ZOLO_PRECISION epsilon, int n) {
zd -> deg_num = d -> deg_num;
zd -> deg_denom = d -> deg_denom;
zd -> a = (ZOLO_PRECISION*) malloc(zd -> dn * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (ZOLO_PRECISION) d -> a[m];
zd -> a = (PRECISION*) malloc(zd -> dn * sizeof(PRECISION));
for (m = 0; m < zd -> dn; m++) zd -> a[m] = (PRECISION) d -> a[m];
free(d -> a);
zd -> ap = (ZOLO_PRECISION*) malloc(zd -> dd * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (ZOLO_PRECISION) d -> ap[m];
zd -> ap = (PRECISION*) malloc(zd -> dd * sizeof(PRECISION));
for (m = 0; m < zd -> dd; m++) zd -> ap[m] = (PRECISION) d -> ap[m];
free(d -> ap);
zd -> alpha = (ZOLO_PRECISION*) malloc(zd -> da * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (ZOLO_PRECISION) d -> alpha[m];
zd -> alpha = (PRECISION*) malloc(zd -> da * sizeof(PRECISION));
for (m = 0; m < zd -> da; m++) zd -> alpha[m] = (PRECISION) d -> alpha[m];
free(d -> alpha);
zd -> beta = (ZOLO_PRECISION*) malloc(zd -> db * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (ZOLO_PRECISION) d -> beta[m];
zd -> beta = (PRECISION*) malloc(zd -> db * sizeof(PRECISION));
for (m = 0; m < zd -> db; m++) zd -> beta[m] = (PRECISION) d -> beta[m];
free(d -> beta);
zd -> gamma = (ZOLO_PRECISION*) malloc(zd -> n * sizeof(ZOLO_PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (ZOLO_PRECISION) d -> gamma[m];
zd -> gamma = (PRECISION*) malloc(zd -> n * sizeof(PRECISION));
for (m = 0; m < zd -> n; m++) zd -> gamma[m] = (PRECISION) d -> gamma[m];
free(d -> gamma);
free(d);
@ -523,17 +523,17 @@ NAMESPACE_END(Grid);
#ifdef TEST
#undef ZERO
#define ZERO ((ZOLO_PRECISION) 0)
#define ZERO ((PRECISION) 0)
#undef ONE
#define ONE ((ZOLO_PRECISION) 1)
#define ONE ((PRECISION) 1)
#undef TWO
#define TWO ((ZOLO_PRECISION) 2)
#define TWO ((PRECISION) 2)
/* Evaluate the rational approximation R(x) using the factored form */
static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R;
PRECISION R;
if (rdata -> type == 0) {
R = rdata -> A * x;
@ -551,9 +551,9 @@ static ZOLO_PRECISION zolotarev_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
/* Evaluate the rational approximation R(x) using the partial fraction form */
static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_partfrac_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R = rdata -> alpha[rdata -> da - 1];
PRECISION R = rdata -> alpha[rdata -> da - 1];
for (m = 0; m < rdata -> dd; m++)
R += rdata -> alpha[m] / (x * x - rdata -> ap[m]);
if (rdata -> type == 1) R += rdata -> alpha[rdata -> dd] / (x * x);
@ -568,18 +568,18 @@ static ZOLO_PRECISION zolotarev_partfrac_eval(ZOLO_PRECISION x, zolotarev_data*
* non-signalling overflow this will work correctly since 1/(1/0) = 1/INF = 0,
* but with signalling overflow you will get an error message. */
static ZOLO_PRECISION zolotarev_contfrac_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_contfrac_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION R = rdata -> beta[0] * x;
PRECISION R = rdata -> beta[0] * x;
for (m = 1; m < rdata -> db; m++) R = rdata -> beta[m] * x + ONE / R;
return R;
}
/* Evaluate the rational approximation R(x) using Cayley form */
static ZOLO_PRECISION zolotarev_cayley_eval(ZOLO_PRECISION x, zolotarev_data* rdata) {
static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
int m;
ZOLO_PRECISION T;
PRECISION T;
T = rdata -> type == 0 ? ONE : -ONE;
for (m = 0; m < rdata -> n; m++)
@ -607,7 +607,7 @@ int main(int argc, char** argv) {
int m, n, plotpts = 5000, type = 0;
float eps, x, ypferr, ycferr, ycaylerr, maxypferr, maxycferr, maxycaylerr;
zolotarev_data *rdata;
ZOLO_PRECISION y;
PRECISION y;
FILE *plot_function, *plot_error,
*plot_partfrac, *plot_contfrac, *plot_cayley;
@ -626,13 +626,13 @@ int main(int argc, char** argv) {
}
rdata = type == 2
? higham((ZOLO_PRECISION) eps, n)
: zolotarev((ZOLO_PRECISION) eps, n, type);
? higham((PRECISION) eps, n)
: zolotarev((PRECISION) eps, n, type);
printf("Zolotarev Test: R(epsilon = %g, n = %d, type = %d)\n\t"
STRINGIFY(VERSION) "\n\t" STRINGIFY(HVERSION)
"\n\tINTERNAL_PRECISION = " STRINGIFY(INTERNAL_PRECISION)
"\tZOLO_PRECISION = " STRINGIFY(ZOLO_PRECISION)
"\tPRECISION = " STRINGIFY(PRECISION)
"\n\n\tRational approximation of degree (%d,%d), %s at x = 0\n"
"\tDelta = %g (maximum error)\n\n"
"\tA = %g (overall factor)\n",
@ -681,15 +681,15 @@ int main(int argc, char** argv) {
x = 2.4 * (float) m / plotpts - 1.2;
if (rdata -> type == 0 || fabs(x) * (float) plotpts > 1.0) {
/* skip x = 0 for type 1, as R(0) is singular */
y = zolotarev_eval((ZOLO_PRECISION) x, rdata);
y = zolotarev_eval((PRECISION) x, rdata);
fprintf(plot_function, "%g %g\n", x, (float) y);
fprintf(plot_error, "%g %g\n",
x, (float)((y - ((x > 0.0 ? ONE : -ONE))) / rdata -> Delta));
ypferr = (float)((zolotarev_partfrac_eval((ZOLO_PRECISION) x, rdata) - y)
ypferr = (float)((zolotarev_partfrac_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycferr = (float)((zolotarev_contfrac_eval((ZOLO_PRECISION) x, rdata) - y)
ycferr = (float)((zolotarev_contfrac_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
ycaylerr = (float)((zolotarev_cayley_eval((ZOLO_PRECISION) x, rdata) - y)
ycaylerr = (float)((zolotarev_cayley_eval((PRECISION) x, rdata) - y)
/ rdata -> Delta);
if (fabs(x) < 1.0 && fabs(x) > rdata -> epsilon) {
maxypferr = MAX(maxypferr, fabs(ypferr));

View File

@ -9,10 +9,10 @@ NAMESPACE_BEGIN(Approx);
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef ZOLO_PRECISION
#define ZOLO_PRECISION double
#ifndef PRECISION
#define PRECISION double
#endif
#define ZPRECISION ZOLO_PRECISION
#define ZPRECISION PRECISION
#define ZOLOTAREV_DATA zolotarev_data
#endif
@ -77,8 +77,8 @@ typedef struct {
* zolotarev_data structure. The arguments must satisfy the constraints that
* epsilon > 0, n > 0, and type = 0 or 1. */
ZOLOTAREV_DATA* higham(ZOLO_PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(ZOLO_PRECISION epsilon, int n, int type);
ZOLOTAREV_DATA* higham(PRECISION epsilon, int n) ;
ZOLOTAREV_DATA* zolotarev(PRECISION epsilon, int n, int type);
void zolotarev_free(zolotarev_data *zdata);
#endif
@ -86,4 +86,3 @@ void zolotarev_free(zolotarev_data *zdata);
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@ -1,34 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/algorithms/blas/BatchedBlas.h>
NAMESPACE_BEGIN(Grid);
gridblasHandle_t GridBLAS::gridblasHandle;
int GridBLAS::gridblasInit;
NAMESPACE_END(Grid);

View File

@ -1,727 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: BatchedBlas.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#ifdef GRID_HIP
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
#include <cublas_v2.h>
#endif
#ifdef GRID_SYCL
#include <oneapi/mkl.hpp>
#endif
#if 0
#define GRID_ONE_MKL
#endif
#ifdef GRID_ONE_MKL
#include <oneapi/mkl.hpp>
#endif
///////////////////////////////////////////////////////////////////////
// Need to rearrange lattice data to be in the right format for a
// batched multiply. Might as well make these static, dense packed
///////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP
typedef hipblasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_CUDA
typedef cublasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_SYCL
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#ifdef GRID_ONE_MKL
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t;
#endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
class GridBLAS {
public:
static gridblasHandle_t gridblasHandle;
static int gridblasInit;
static void Init(void)
{
if ( ! gridblasInit ) {
#ifdef GRID_CUDA
std::cout << "cublasCreate"<<std::endl;
cublasCreate(&gridblasHandle);
cublasSetPointerMode(gridblasHandle, CUBLAS_POINTER_MODE_DEVICE);
#endif
#ifdef GRID_HIP
std::cout << "hipblasCreate"<<std::endl;
hipblasCreate(&gridblasHandle);
#endif
#ifdef GRID_SYCL
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
cl::sycl::cpu_selector selector;
cl::sycl::device selectedDevice { selector };
gridblasHandle =new sycl::queue (selectedDevice);
#endif
gridblasInit=1;
}
}
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
/////////////////////////////////////////////////////////////////////////////////////
// BLAS GEMM conventions:
/////////////////////////////////////////////////////////////////////////////////////
// - C = alpha A * B + beta C
// Dimensions:
// - C_m.n
// - A_m.k
// - B_k.n
// - Flops = 8 M N K
// - Bytes = 2*sizeof(word) * (MN+MK+KN)
// M=60, N=12
// Flop/Byte = 8 . 60.60.12 / (60.12+60.60+60.12)/16 = 4 so expect about 4 TF/s on a GCD
/////////////////////////////////////////////////////////////////////////////////////
void synchronise(void)
{
#ifdef GRID_HIP
auto err = hipDeviceSynchronize();
assert(err==hipSuccess);
#endif
#ifdef GRID_CUDA
auto err = cudaDeviceSynchronize();
assert(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
accelerator_barrier();
#endif
#ifdef GRID_ONE_MKL
gridblasHandle->wait();
#endif
}
void gemmBatched(int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
ComplexF alpha,
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexD> alpha_p(1);
static deviceVector<ComplexD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
RealD t0=usecond();
// std::cout << "ZgemmBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex **)&Amk[0], lda,
(hipblasDoubleComplex **)&Bkn[0], ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl;
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasZgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex **)&Amk[0], lda,
(cuDoubleComplex **)&Bkn[0], ldb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
// synchronise();
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n)*batchCount;
// std::cout <<GridLogMessage<< " batched Blas copy "<<(t0-t2)/1.e3 <<" ms "<<std::endl;
// std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< flops/(t1-t0)/1.e3 <<" GF/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
// std::cout <<GridLogMessage<< " batched Blas zGemm call "<<m<<","<<n<<","<<k<<" "<< bytes/(t1-t0)/1.e3 <<" GB/s "<<(t1-t0)/1.e3<<" ms "<<std::endl;
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexF alpha,
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexF> alpha_p(1);
static deviceVector<ComplexF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex **)&Amk[0], lda,
(hipblasComplex **)&Bkn[0], ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
ComplexF alphaf(real(alpha),imag(alpha));
ComplexF betaf(real(beta),imag(beta));
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexF c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alphaf)*c_mn + (betaf)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Single precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealF> alpha_p(1);
static deviceVector<RealF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealD> alpha_p(1);
static deviceVector<RealD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasDgemmBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasDgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
/*
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t batchCount64=batchCount;
oneapi::mkl::blas::column_major::gemm_batch(*theGridAccelerator,
onemkl::transpose::N,
onemkl::transpose::N,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
1,&batchCount64);
*/
//MKLs cblas_<T>gemm_batch & OneAPI
#warning "oneMKL implementation not built "
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
int sda = lda*k;
int sdb = ldb*k;
int sdc = ldc*n;
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
RealD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[p][mm + kk*lda ] * Bkn[p][kk + nn*ldb];
Cmn[p][mm + nn*ldc] = (alpha)*c_mn + (beta)*Cmn[p][mm + nn*ldc ];
}
}
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Strided case used by benchmark, but generally unused in Grid
// Keep a code example in double complex, but don't generate the single and real variants for now
////////////////////////////////////////////////////////////////////////////////////////////////
void gemmStridedBatched(int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // pointer list to matrices
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn,
int batchCount)
{
// Use C-row major storage, so transpose calls
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
int sda = m*k;
int sdb = k*n;
int sdc = m*n;
deviceVector<ComplexD> alpha_p(1);
deviceVector<ComplexD> beta_p(1);
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
// std::cout << "blasZgemmStridedBatched mnk "<<m<<","<<n<<","<<k<<" count "<<batchCount<<std::endl;
// std::cout << "blasZgemmStridedBatched ld "<<lda<<","<<ldb<<","<<ldc<<std::endl;
// std::cout << "blasZgemmStridedBatched sd "<<sda<<","<<sdb<<","<<sdc<<std::endl;
#ifdef GRID_HIP
auto err = hipblasZgemmStridedBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda, sda,
(hipblasDoubleComplex *) Bkn, ldb, sdb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc, sdc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasZgemmStridedBatched(gridblasHandle,
CUBLAS_OP_N,
CUBLAS_OP_N,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda, sda,
(cuDoubleComplex *) Bkn, ldb, sdb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc, sdc,
batchCount);
#endif
#if defined(GRID_SYCL) || defined(GRID_ONE_MKL)
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
oneapi::mkl::transpose::N,
oneapi::mkl::transpose::N,
m,n,k,
alpha,
(const ComplexD *)Amk,lda,sda,
(const ComplexD *)Bkn,ldb,sdb,
beta,
(ComplexD *)Cmn,ldc,sdc,
batchCount);
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
// Need a default/reference implementation
for (int p = 0; p < batchCount; ++p) {
for (int mm = 0; mm < m; ++mm) {
for (int nn = 0; nn < n; ++nn) {
ComplexD c_mn(0.0);
for (int kk = 0; kk < k; ++kk)
c_mn += Amk[mm + kk*lda + p*sda] * Bkn[kk + nn*ldb + p*sdb];
Cmn[mm + nn*ldc + p*sdc] = (alpha)*c_mn + (beta)*Cmn[mm + nn*ldc + p*sdc];
}
}
}
#endif
}
double benchmark(int M, int N, int K, int BATCH)
{
int32_t N_A = M*K*BATCH;
int32_t N_B = K*N*BATCH;
int32_t N_C = M*N*BATCH;
deviceVector<ComplexD> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(ComplexD));
deviceVector<ComplexD> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(ComplexD));
deviceVector<ComplexD> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(ComplexD));
ComplexD alpha(1.0);
ComplexD beta (1.0);
RealD flops = 8.0*M*N*K*BATCH;
int ncall=10;
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemmStridedBatched(M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0], // m x n
BATCH);
}
synchronise();
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K)*BATCH;
flops = 8.0*M*N*K*BATCH*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
};
NAMESPACE_END(Grid);

View File

@ -166,16 +166,16 @@ public:
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChange(ps_f[s],src_d);
precisionChangeFast(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f,p_d);
precisionChangeFast(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d,mmp_f);
precisionChangeFast(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
@ -204,7 +204,7 @@ public:
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChange(psi_f[s],psi_d[s]);
precisionChangeFast(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
@ -225,7 +225,7 @@ public:
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(r_f, r_d);
precisionChangeFast(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
@ -243,13 +243,13 @@ public:
cp=c;
PrecChangeTimer.Start();
precisionChange(p_f, p_d); //get back single prec search direction for linop
precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f); // From Float to Double
precisionChangeFast(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
@ -311,7 +311,7 @@ public:
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChange(psi_d[s],psi_f[s]);
precisionChangeFast(psi_d[s],psi_f[s]);
}

View File

@ -211,7 +211,7 @@ public:
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
assert(norm2(tmp_d)< 1.0);
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);

View File

@ -419,15 +419,14 @@ until convergence
}
}
if ( Nconv < Nstop ) {
if ( Nconv < Nstop )
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
}
eval=eval2;
//Keep only converged
eval.resize(Nstop);// was Nconv
evec.resize(Nstop,grid);// was Nconv
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
basisSortInPlace(evec,eval,reverse);
}

View File

@ -176,7 +176,6 @@ template<class T> using cshiftAllocator = std::allocator<T>;
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid);

View File

@ -519,6 +519,7 @@ void MemoryManager::Audit(std::string s)
uint64_t LruBytes1=0;
uint64_t LruBytes2=0;
uint64_t LruCnt=0;
uint64_t LockedBytes=0;
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){

View File

@ -128,7 +128,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return xbytes+rbytes;
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{

View File

@ -91,59 +91,6 @@ void *SharedMemory::ShmBufferSelf(void)
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank];
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
NAMESPACE_END(Grid);

View File

@ -27,10 +27,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
*************************************************************************************/
/* END LEGAL */
#define Mheader "SharedMemoryMpi: "
#include <Grid/GridCore.h>
#include <pwd.h>
#include <syscall.h>
#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
@ -40,118 +39,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
#endif
#ifdef GRID_SYCL
#define GRID_SYCL_LEVEL_ZERO_IPC
#include <syscall.h>
#define SHM_SOCKETS
#endif
#include <sys/socket.h>
#include <sys/un.h>
NAMESPACE_BEGIN(Grid);
#ifdef SHM_SOCKETS
/*
* Barbaric extra intranode communication route in case we need sockets to pass FDs
* Forced by level_zero not being nicely designed
*/
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
sendmsg(sock, &msg, 0);
};
};
#endif
#define header "SharedMemoryMpi: "
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
@ -174,8 +66,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank
@ -278,7 +170,59 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
////////////////////////////////////////////////////////////////
@ -452,7 +396,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
@ -537,7 +481,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE);
}
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
@ -580,7 +524,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
@ -588,13 +532,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef SHM_SOCKETS
UnixSockets::Open(WorldShmRank);
#endif
for(int r=0;r<WorldShmSize;r++){
MPI_Barrier(WorldShmComm);
#ifndef GRID_MPI3_SHM_NONE
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
@ -602,32 +541,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
typedef struct { int fd; pid_t pid ; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
#ifdef SHM_SOCKETS
for(int rr=0;rr<WorldShmSize;rr++){
if(rr!=r){
UnixSockets::SendFileDescriptor(handle.fd,rr);
}
}
#endif
}
#endif
#ifdef GRID_CUDA
@ -655,7 +586,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
MPI_Barrier(WorldShmComm);
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
@ -671,10 +601,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
int myfd;
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
@ -682,22 +608,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
if (myfd < 0) {
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
perror("pidfd_getfd failed ");
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@ -732,7 +652,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
MPI_Barrier(WorldShmComm);
}
_ShmAllocBytes=bytes;
@ -744,7 +663,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -781,7 +700,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@ -791,7 +710,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -838,7 +757,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);

View File

@ -29,27 +29,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern std::vector<std::pair<int,int> > Cshift_table;
extern commVector<std::pair<int,int> > Cshift_table_device;
extern Vector<std::pair<int,int> > Cshift_table;
inline std::pair<int,int> *MapCshiftTable(void)
{
// GPU version
#ifdef ACCELERATOR_CSHIFT
uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz);
}
acceleratorCopyToDevice((void *)&Cshift_table[0],
(void *)&Cshift_table_device[0],
sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0];
#else
return &Cshift_table[0];
#endif
// CPU version use identify map
}
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
@ -93,8 +74,8 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
}
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
@ -244,7 +225,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
@ -316,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
}
}
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
template <typename T>
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= e1*e2) return;
int n, b, o;
n = idx / e2;
b = idx % e2;
o = n*stride + b;
vector[2*idx + 0] = lo + o;
vector[2*idx + 1] = ro + o;
}
#endif
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
@ -340,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int ent=0;
if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@ -359,7 +372,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}
{
auto table = MapCshiftTable();
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
@ -396,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int ent=0;
if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@ -411,7 +432,7 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}
{
auto table = MapCshiftTable();
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);

View File

@ -52,8 +52,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
RealD t1,t0;
t0=usecond();
if ( !comm_dim ) {
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
@ -64,8 +63,6 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
t1=usecond();
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
@ -130,20 +127,16 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
@ -151,39 +144,26 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
tscatter+=usecond();
}
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -210,12 +190,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
@ -253,9 +227,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
tgather-=usecond();
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
@ -280,8 +252,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
@ -291,9 +262,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
recv_from_rank,
bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
@ -301,17 +270,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -331,11 +292,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
@ -359,9 +315,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
@ -370,9 +324,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
@ -380,8 +332,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
@ -389,24 +340,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
// grid->Barrier();
tcomms+=usecond();
grid->Barrier();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -432,11 +372,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
@ -479,10 +414,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
@ -507,8 +440,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
@ -517,28 +449,17 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
*/
}
#endif
NAMESPACE_END(Grid);

View File

@ -1,5 +1,4 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
std::vector<std::pair<int,int> > Cshift_table;
commVector<std::pair<int,int> > Cshift_table_device;
Vector<std::pair<int,int> > Cshift_table;
NAMESPACE_END(Grid);

View File

@ -35,7 +35,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_real_imag.h>
@ -47,4 +46,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/PaddedCell.h>
#include <Grid/lattice/Lattice_crc.h>

View File

@ -345,9 +345,7 @@ GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnarySpTa, SpTa(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
@ -458,9 +456,7 @@ GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(SpTa, UnarySpTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the

View File

@ -270,42 +270,5 @@ RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const L
return axpby_norm_fast(ret,a,b,x,y);
}
/// Trace product
template<class obj> auto traceProduct(const Lattice<obj> &rhs_1,const Lattice<obj> &rhs_2)
-> Lattice<decltype(trace(obj()))>
{
typedef decltype(trace(obj())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( rhs2 , rhs_2, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2(ss)));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const Lattice<obj1> &rhs_1,const obj2 &rhs2)
-> Lattice<decltype(trace(obj1()))>
{
typedef decltype(trace(obj1())) robj;
Lattice<robj> ret_i(rhs_1.Grid());
autoView( rhs1 , rhs_1, AcceleratorRead);
autoView( ret , ret_i, AcceleratorWrite);
ret.Checkerboard() = rhs_1.Checkerboard();
accelerator_for(ss,rhs1.size(),obj1::Nsimd(),{
coalescedWrite(ret[ss],traceProduct(rhs1(ss),rhs2));
});
return ret_i;
}
template<class obj1,class obj2> auto traceProduct(const obj2 &rhs_2,const Lattice<obj1> &rhs_1)
-> Lattice<decltype(trace(obj1()))>
{
return traceProduct(rhs_1,rhs_2);
}
NAMESPACE_END(Grid);
#endif

View File

@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
#if ( (!defined(GRID_CUDA)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region

View File

@ -42,13 +42,13 @@ template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1
}
}
template<class vobj> uint32_t crc(const Lattice<vobj> & buf)
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cerr << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);

View File

@ -31,7 +31,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
#include <Grid/lattice/Lattice_slicesum_core.h>
NAMESPACE_BEGIN(Grid);
@ -281,17 +280,11 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
return nrm;
}
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
uint32_t csum=0;
// Uint32Checksum(left,csum);
ComplexD nrm = rankInnerProduct(left,right);
RealD local = real(nrm);
GridNormLog(real(nrm),csum); // Could log before and after global sum to distinguish local and MPI
grid->GlobalSum(nrm);
GridMPINormLog(local,real(nrm));
return nrm;
}
@ -455,10 +448,19 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
int ostride=grid->_ostride[orthogdim];
//Reduce Data down to lvSum
sliceSumReduction(Data,lvSum,rd, e1,e2,stride,ostride,Nsimd);
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
@ -502,7 +504,6 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
return result;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{

View File

@ -30,7 +30,7 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
auto r=hipGetDevice(&device);
hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize;

View File

@ -152,7 +152,6 @@ public:
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
#if 0
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
@ -163,9 +162,9 @@ public:
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
@ -180,9 +179,6 @@ public:
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
#else
eng.discardhi(site);
#endif
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
@ -411,7 +407,7 @@ public:
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds, int britney=0){
void SeedFixedIntegers(const std::vector<int> &seeds){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
@ -428,6 +424,7 @@ public:
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
#if 1
thread_for( lidx, _grid->lSites(), {
int gidx;
@ -448,12 +445,29 @@ public:
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
if ( britney ) {
Skip(_generators[l_idx],l_idx); // Skip to next RNG sequence
} else {
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
});
#else
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int o_idx;
int i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
#endif
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient

View File

@ -1,213 +0,0 @@
#pragma once
#include <type_traits>
#if defined(GRID_CUDA)
#include <cub/cub.cuh>
#define gpucub cub
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#elif defined(GRID_HIP)
#include <hipcub/hipcub.hpp>
#define gpucub hipcub
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#endif
NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
size_t subvol_size = e1*e2;
commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
vobj zero_init;
zeroit(zero_init);
void *temp_storage_array = NULL;
size_t temp_storage_bytes = 0;
vobj *d_out;
int* d_offsets;
std::vector<int> offsets(rd+1,0);
for (int i = 0; i < offsets.size(); i++) {
offsets[i] = i*subvol_size;
}
//Allocate memory for output and offset arrays on device
d_out = static_cast<vobj*>(acceleratorAllocDevice(rd*sizeof(vobj)));
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
//allocate memory for temp_storage_array
temp_storage_array = acceleratorAllocDevice(temp_storage_bytes);
//prepare buffer for reduction
//use non-blocking accelerator_for to avoid syncs (ok because we submit to same computeStream)
//use 2d accelerator_for to avoid launch latencies found when serially looping over rd
accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data[ss]));
});
//issue segmented reductions in computeStream
gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
if (gpuErr!=gpuSuccess) {
std::cout << GridLogError << "Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();
acceleratorFreeDevice(temp_storage_array);
acceleratorFreeDevice(d_out);
acceleratorFreeDevice(d_offsets);
}
template<class vobj> inline void sliceSumReduction_cub_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2;
commVector<vector>buffer(osites);
vector *dat = (vector *)Data;
vector *buf = &buffer[0];
Vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) {
accelerator_for(ss,osites,1,{
buf[ss] = dat[ss*words+w];
});
sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r];
}
}
}
template<class vobj> inline void sliceSumReduction_cub(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
{
autoView(Data_v, Data, AcceleratorRead); //hipcub/cub cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) {
sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
else {
sliceSumReduction_cub_large(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
}
}
#endif
#if defined(GRID_SYCL)
template<class vobj> inline void sliceSumReduction_sycl(const Lattice<vobj> &Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
typedef typename vobj::scalar_object sobj;
size_t subvol_size = e1*e2;
vobj *mysum = (vobj *) malloc_shared(sizeof(vobj),*theGridAccelerator);
vobj vobj_zero;
zeroit(vobj_zero);
commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
autoView(Data_v, Data, AcceleratorRead);
//prepare reduction buffer
accelerator_for2d( s,subvol_size, r,rd, (size_t)Nsimd,{
int n = s / e2;
int b = s % e2;
int so=r*ostride; // base offset for start of plane
int ss= so+n*stride+b;
coalescedWrite(rb_p[r*subvol_size+s], coalescedRead(Data_v[ss]));
});
for (int r = 0; r < rd; r++) {
mysum[0] = vobj_zero; //dirty hack: cannot pass vobj_zero as identity to sycl::reduction as its not device_copyable
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
Reduction,
[=](cl::sycl::id<1> item, auto &sum) {
auto s = item[0];
sum += rb_p[r*subvol_size+s];
});
});
theGridAccelerator->wait();
lvSum[r] = mysum[0];
}
free(mysum,*theGridAccelerator);
}
#endif
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, CpuRead);
thread_for( r,rd, {
int so=r*ostride; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
}
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
#if defined(GRID_CUDA) || defined(GRID_HIP)
sliceSumReduction_cub(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#elif defined(GRID_SYCL)
sliceSumReduction_sycl(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
}
NAMESPACE_END(Grid);

View File

@ -66,65 +66,6 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<
return ret;
};
template<int N, class Vec>
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
typedef typename Vec::scalar_type scalar;
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<scalar, N> > > Us;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
scalar tmp= Us()()(i,j);
ComplexD ztmp(real(tmp),imag(tmp));
EigenU(i,j)=ztmp;
}}
ComplexD detD = EigenU.determinant();
typename Vec::scalar_type det(detD.real(),detD.imag());
pokeLocalSite(det,ret_v,lcoor);
});
return ret;
}
template<int N>
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
EigenU(i,j) = Us()()(i,j);
}}
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
Ui()()(i,j) = EigenUinv(i,j);
}}
pokeLocalSite(Ui,ret_v,lcoor);
});
return ret;
}
NAMESPACE_END(Grid);
#endif

View File

@ -469,13 +469,15 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
vobj cd = Zero();
vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){
@ -695,68 +697,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]);
}
// the above should guarantee that the operations are local
#if 1
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
size_t tbytes = 4*nsite*sizeof(int);
int *table = (int*)malloc(tbytes);
thread_for(idx, nsite, {
Coordinate from_coor, to_coor;
size_t rem = idx;
for(int i=0;i<nd;i++){
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
from_coor[i] = base_i + FromLowerLeft[i];
to_coor[i] = base_i + ToLowerLeft[i];
}
int foidx = Fg->oIndex(from_coor);
int fiidx = Fg->iIndex(from_coor);
int toidx = Tg->oIndex(to_coor);
int tiidx = Tg->iIndex(to_coor);
int* tt = table + 4*idx;
tt[0] = foidx;
tt[1] = fiidx;
tt[2] = toidx;
tt[3] = tiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride;
@ -765,9 +707,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
autoView( t_v , To, CpuWrite);
autoView( f_v , From, CpuRead);
thread_for(idx,Fg->lSites(),{
autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
@ -780,24 +722,17 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
}
if (in_region) {
#if 0
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t];
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
vector_type * fp = (vector_type *)&f_v[odx_f];
vector_type * tp = (vector_type *)&t_v[odx_t];
for(int w=0;w<words;w++){
tp[w].putlane(fp[w].getlane(idx_f),idx_t);
}
#else
peekLocalSite(s,f_v,Fcoor);
pokeLocalSite(s,t_v,Tcoor);
#endif
}
});
#endif
}
@ -890,8 +825,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
}
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
template<class vobj>
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
@ -908,70 +841,11 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
#if 1
size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
size_t tbytes = 4*nsite*sizeof(int);
int *table = (int*)malloc(tbytes);
thread_for(idx,nsite,{
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lcoor[orthog] = slice_lo;
hcoor[orthog] = slice_hi;
size_t rem = idx;
for(int mu=0;mu<nl;mu++){
if(mu != orthog){
int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
lcoor[mu] = hcoor[mu] = xmu;
}
}
int loidx = lg->oIndex(lcoor);
int liidx = lg->iIndex(lcoor);
int hoidx = hg->oIndex(hcoor);
int hiidx = hg->iIndex(hcoor);
int* tt = table + 4*idx;
tt[0] = loidx;
tt[1] = liidx;
tt[2] = hoidx;
tt[3] = hiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(lowDim_v,lowDim,AcceleratorRead);
autoView(higherDim_v,higherDim,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
vector_type* to = (vector_type *)&higherDim_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
// the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite);
@ -987,7 +861,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
pokeLocalSite(s,higherDimv,hcoor);
}
});
#endif
}

View File

@ -45,7 +45,6 @@ public:
};
// Host only
GridBase * getGrid(void) const { return _grid; };
vobj* getHostPointer(void) const { return _odata; };
};
/////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,174 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/PaddedCell.h
Copyright (C) 2019
Author: Peter Boyle pboyle@bnl.gov
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include<Grid/cshift/Cshift.h>
NAMESPACE_BEGIN(Grid);
//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
template<typename vobj>
struct CshiftImplBase{
virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
virtual ~CshiftImplBase(){}
};
template<typename vobj>
struct CshiftImplDefault: public CshiftImplBase<vobj>{
Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
};
template<typename Gimpl>
struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
};
class PaddedCell {
public:
GridCartesian * unpadded_grid;
int dims;
int depth;
std::vector<GridCartesian *> grids;
~PaddedCell()
{
DeleteGrids();
}
PaddedCell(int _depth,GridCartesian *_grid)
{
unpadded_grid = _grid;
depth=_depth;
dims=_grid->Nd();
AllocateGrids();
Coordinate local =unpadded_grid->LocalDimensions();
for(int d=0;d<dims;d++){
assert(local[d]>=depth);
}
}
void DeleteGrids(void)
{
for(int d=0;d<grids.size();d++){
delete grids[d];
}
grids.resize(0);
};
void AllocateGrids(void)
{
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate simd =unpadded_grid->_simd_layout;
Coordinate processors=unpadded_grid->_processors;
Coordinate plocal =unpadded_grid->LocalDimensions();
Coordinate global(dims);
// expand up one dim at a time
for(int d=0;d<dims;d++){
plocal[d] += 2*depth;
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
grids.push_back(new GridCartesian(global,simd,processors));
}
};
template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate fll(dims,depth); // depends on the MPI spread
Coordinate tll(dims,0); // depends on the MPI spread
localCopyRegion(in,out,fll,tll,local);
return out;
}
template<class vobj>
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = Expand(d,tmp,cshift); // rvalue && assignment
}
return tmp;
}
// expand up one dim at a time
template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded;
}
};
NAMESPACE_END(Grid);

View File

@ -179,11 +179,11 @@ extern GridLogger GridLogSolver;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;
extern GridLogger GridLogIntegrator;
extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern GridLogger GridLogMemory;
extern GridLogger GridLogTracing;
@ -191,41 +191,6 @@ extern Colours GridLogColours;
std::string demangle(const char* name) ;
template<typename... Args>
inline std::string sjoin(Args&&... args) noexcept {
std::ostringstream msg;
(msg << ... << args);
return msg.str();
}
/*! @brief make log messages work like python print */
template <typename... Args>
inline void Grid_log(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << GridLogMessage << msg << std::endl;
}
/*! @brief make warning messages work like python print */
template <typename... Args>
inline void Grid_warn(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[33m" << GridLogWarning << msg << "\033[0m" << std::endl;
}
/*! @brief make error messages work like python print */
template <typename... Args>
inline void Grid_error(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[31m" << GridLogError << msg << "\033[0m" << std::endl;
}
/*! @brief make pass messages work like python print */
template <typename... Args>
inline void Grid_pass(Args&&... args) {
std::string msg = sjoin(std::forward<Args>(args)...);
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
}
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];

View File

@ -34,7 +34,7 @@ class GridTracer {
};
inline void tracePush(const char *name) { roctxRangePushA(name); }
inline void tracePop(const char *name) { roctxRangePop(); }
inline int traceStart(const char *name) { return roctxRangeStart(name); }
inline int traceStart(const char *name) { roctxRangeStart(name); }
inline void traceStop(int ID) { roctxRangeStop(ID); }
#endif

View File

@ -104,7 +104,6 @@ template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iSca
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iLorentzComplex = iVector<iScalar<iScalar<vtype> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
@ -179,15 +178,6 @@ typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
// LorentzComplex
typedef iLorentzComplex<Complex > LorentzComplex;
typedef iLorentzComplex<ComplexF > LorentzComplexF;
typedef iLorentzComplex<ComplexD > LorentzComplexD;
typedef iLorentzComplex<vComplex > vLorentzComplex;
typedef iLorentzComplex<vComplexF> vLorentzComplexF;
typedef iLorentzComplex<vComplexD> vLorentzComplexD;
// DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
@ -317,10 +307,6 @@ typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
typedef Lattice<vLorentzComplex> LatticeLorentzComplex;
typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
// DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;

View File

@ -34,24 +34,10 @@ directory
NAMESPACE_BEGIN(Grid);
///////////////////////////////////
// Smart configuration base class
///////////////////////////////////
template< class Field >
class ConfigurationBase
{
public:
ConfigurationBase() {}
virtual ~ConfigurationBase() {}
virtual void set_Field(Field& U) =0;
virtual void smeared_force(Field&) = 0;
virtual Field& get_SmearedU() =0;
virtual Field &get_U(bool smeared = false) = 0;
};
template <class GaugeField >
class Action
{
public:
bool is_smeared = false;
RealD deriv_norm_sum;
@ -91,60 +77,16 @@ public:
void refresh_timer_stop(void) { refresh_us+=usecond(); }
void S_timer_start(void) { S_us-=usecond(); }
void S_timer_stop(void) { S_us+=usecond(); }
/////////////////////////////
// Heatbath?
/////////////////////////////
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
/////////////////////////////////////////////////////////////
// virtual smeared interface through configuration container
/////////////////////////////////////////////////////////////
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
{
refresh(U.get_U(is_smeared),sRNG,pRNG);
}
virtual RealD S(ConfigurationBase<GaugeField>& U)
{
return S(U.get_U(is_smeared));
}
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
{
return Sinitial(U.get_U(is_smeared));
}
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
{
deriv(U.get_U(is_smeared),dSdU);
if ( is_smeared ) {
U.smeared_force(dSdU);
}
}
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() = 0; // return the action name
virtual std::string LogParameters() = 0; // prints action parameters
virtual ~Action(){}
};
template <class GaugeField >
class EmptyAction : public Action <GaugeField>
{
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() { return std::string("Level Force Log"); };
virtual std::string LogParameters() { return std::string("No parameters");};
};
NAMESPACE_END(Grid);
#endif // ACTION_BASE_H

View File

@ -30,8 +30,6 @@ directory
#ifndef QCD_ACTION_CORE
#define QCD_ACTION_CORE
#include <Grid/qcd/action/gauge/GaugeImplementations.h>
#include <Grid/qcd/action/ActionBase.h>
NAMESPACE_CHECK(ActionBase);
#include <Grid/qcd/action/ActionSet.h>

View File

@ -126,16 +126,6 @@ typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermi
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
// Sp(2n)
typedef WilsonFermion<SpWilsonImplF> SpWilsonFermionF;
typedef WilsonFermion<SpWilsonImplD> SpWilsonFermionD;
typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplF> SpWilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplD> SpWilsonTwoIndexAntiSymmetricFermionD;
typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplF> SpWilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplD> SpWilsonTwoIndexSymmetricFermionD;
// Twisted mass fermion
typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;

View File

@ -507,7 +507,6 @@ public:
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
}
};

View File

@ -261,22 +261,6 @@ typedef WilsonImpl<vComplex, TwoIndexAntiSymmetricRepresentation, CoeffReal > W
typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double
//sp 2n
typedef WilsonImpl<vComplex, SpFundamentalRepresentation, CoeffReal > SpWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpFundamentalRepresentation, CoeffReal > SpWilsonImplF; // Float
typedef WilsonImpl<vComplexD, SpFundamentalRepresentation, CoeffReal > SpWilsonImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplR; // Real.. whichever prec // adj = 2indx symmetric for Sp(2N)
typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplF; // Float // adj = 2indx symmetric for Sp(2N)
typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplD; // Double // adj = 2indx symmetric for Sp(2N)
NAMESPACE_END(Grid);

View File

@ -63,9 +63,7 @@ public:
virtual void MooeeDag(const FermionField &in, FermionField &out) ;
virtual void MooeeInv(const FermionField &in, FermionField &out) ;
virtual void MooeeInvDag(const FermionField &in, FermionField &out) ;
virtual void M(const FermionField &in, FermionField &out) ;
virtual void Mdag(const FermionField &in, FermionField &out) ;
private:
RealD mu; // TwistedMass parameter

View File

@ -280,16 +280,20 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");
}
@ -318,13 +322,19 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
if( interior && exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
#endif
} else if( interior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
#endif
} else if( exterior ) {
if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
#ifndef GRID_CUDA
if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
#endif
}
}

View File

@ -332,7 +332,8 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
/////////////////////////////
{
GRID_TRACE("Gather");
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
st.HaloExchangeOptGather(in,compressor);
accelerator_barrier();
}
std::vector<std::vector<CommsRequest_t> > requests;

View File

@ -423,14 +423,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
#define KERNEL_CALL_EXT(A) \
const uint64_t NN = Nsite*Ls; \
const uint64_t sz = st.surface_list.size(); \
auto ptr = &st.surface_list[0]; \
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
int sF = ptr[ss]; \
int sU = sF/Ls; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
}); \
accelerator_barrier();
});
#define ASM_CALL(A) \
thread_for( sss, Nsite, { \
@ -474,10 +474,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
// dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif
@ -507,10 +506,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
// Dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif

View File

@ -93,25 +93,5 @@ void WilsonTMFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &ou
RealD b = tm /sq;
axpibg5x(out,in,a,b);
}
template<class Impl>
void WilsonTMFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
this->Dhop(in, out, DaggerNo);
FermionField tmp(out.Grid());
RealD a = 4.0+this->mass;
RealD b = this->mu;
axpibg5x(tmp,in,a,b);
axpy(out, 1.0, tmp, out);
}
template<class Impl>
void WilsonTMFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
this->Dhop(in, out, DaggerYes);
FermionField tmp(out.Grid());
RealD a = 4.0+this->mass;
RealD b = -this->mu;
axpibg5x(tmp,in,a,b);
axpy(out, 1.0, tmp, out);
}
NAMESPACE_END(Grid);

View File

@ -1 +0,0 @@
../WilsonCloverFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonTMFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
#define IMPLEMENTATION SpWilsonImplD

View File

@ -1 +0,0 @@
../WilsonCloverFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonTMFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
#define IMPLEMENTATION SpWilsonImplF

View File

@ -1 +0,0 @@
#define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplD

View File

@ -1 +0,0 @@
#define IMPLEMENTATION SpWilsonTwoIndexAntiSymmetricImplF

View File

@ -1 +0,0 @@
#define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplD

View File

@ -1 +0,0 @@
#define IMPLEMENTATION SpWilsonTwoIndexSymmetricImplF

View File

@ -10,18 +10,12 @@ WILSON_IMPL_LIST=" \
WilsonImplF \
WilsonImplD \
WilsonImplD2 \
SpWilsonImplF \
SpWilsonImplD \
WilsonAdjImplF \
WilsonAdjImplD \
WilsonTwoIndexSymmetricImplF \
WilsonTwoIndexSymmetricImplD \
WilsonTwoIndexAntiSymmetricImplF \
WilsonTwoIndexAntiSymmetricImplD \
SpWilsonTwoIndexAntiSymmetricImplF \
SpWilsonTwoIndexAntiSymmetricImplD \
SpWilsonTwoIndexSymmetricImplF \
SpWilsonTwoIndexSymmetricImplD \
GparityWilsonImplF \
GparityWilsonImplD "

View File

@ -39,9 +39,6 @@ NAMESPACE_BEGIN(Grid);
typedef WilsonGaugeAction<PeriodicGimplR> WilsonGaugeActionR;
typedef WilsonGaugeAction<PeriodicGimplF> WilsonGaugeActionF;
typedef WilsonGaugeAction<PeriodicGimplD> WilsonGaugeActionD;
typedef WilsonGaugeAction<SpPeriodicGimplR> SpWilsonGaugeActionR;
typedef WilsonGaugeAction<SpPeriodicGimplF> SpWilsonGaugeActionF;
typedef WilsonGaugeAction<SpPeriodicGimplD> SpWilsonGaugeActionD;
typedef PlaqPlusRectangleAction<PeriodicGimplR> PlaqPlusRectangleActionR;
typedef PlaqPlusRectangleAction<PeriodicGimplF> PlaqPlusRectangleActionF;
typedef PlaqPlusRectangleAction<PeriodicGimplD> PlaqPlusRectangleActionD;

View File

@ -61,7 +61,7 @@ NAMESPACE_BEGIN(Grid);
typedef typename Impl::Field Field;
// hardcodes the exponential approximation in the template
template <class S, int Nrepresentation = Nc, int Nexp = 12, class Group = SU<Nc> > class GaugeImplTypes {
template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes {
public:
typedef S Simd;
typedef typename Simd::scalar_type scalar_type;
@ -78,6 +78,8 @@ public:
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;
typedef SU<Nrepresentation> Group;
// Guido: we can probably separate the types from the HMC functions
// this will create 2 kind of implementations
// probably confusing the users
@ -117,7 +119,6 @@ public:
//
LinkField Pmu(P.Grid());
Pmu = Zero();
for (int mu = 0; mu < Nd; mu++) {
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR) ;
@ -125,12 +126,8 @@ public:
PokeIndex<LorentzIndex>(P, Pmu, mu);
}
}
static inline Field projectForce(Field &P) {
Field ret(P.Grid());
Group::taProj(P, ret);
return ret;
}
static inline Field projectForce(Field &P) { return Ta(P); }
static inline void update_field(Field& P, Field& U, double ep){
//static std::chrono::duration<double> diff;
@ -140,15 +137,14 @@ public:
autoView(P_v,P,AcceleratorRead);
accelerator_for(ss, P.Grid()->oSites(),1,{
for (int mu = 0; mu < Nd; mu++) {
U_v[ss](mu) = Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu);
U_v[ss](mu) = Group::ProjectOnGeneralGroup(U_v[ss](mu));
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
}
});
//auto end = std::chrono::high_resolution_clock::now();
// diff += end - start;
// std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
}
static inline RealD FieldSquareNorm(Field& U){
LatticeComplex Hloc(U.Grid());
Hloc = Zero();
@ -161,7 +157,7 @@ public:
}
static inline void Project(Field &U) {
Group::ProjectOnSpecialGroup(U);
ProjectSUn(U);
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
@ -175,7 +171,6 @@ public:
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
Group::ColdConfiguration(pRNG, U);
}
};
@ -183,17 +178,10 @@ typedef GaugeImplTypes<vComplex, Nc> GimplTypesR;
typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF;
typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD;
typedef GaugeImplTypes<vComplex, Nc, 12, Sp<Nc> > SpGimplTypesR;
typedef GaugeImplTypes<vComplexF, Nc, 12, Sp<Nc> > SpGimplTypesF;
typedef GaugeImplTypes<vComplexD, Nc, 12, Sp<Nc> > SpGimplTypesD;
typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR;
typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF;
typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD;
NAMESPACE_END(Grid);
#endif // GRID_GAUGE_IMPL_TYPES_H

View File

@ -176,7 +176,7 @@ public:
return PeriodicBC::CshiftLink(Link,mu,shift);
}
static inline void setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
static inline std::vector<int> getDirections(void) { return _conjDirs; }
static inline bool isPeriodicGaugeField(void) { return false; }
};
@ -193,11 +193,6 @@ typedef ConjugateGaugeImpl<GimplTypesR> ConjugateGimplR; // Real.. whichever pre
typedef ConjugateGaugeImpl<GimplTypesF> ConjugateGimplF; // Float
typedef ConjugateGaugeImpl<GimplTypesD> ConjugateGimplD; // Double
typedef PeriodicGaugeImpl<SpGimplTypesR> SpPeriodicGimplR; // Real.. whichever prec
typedef PeriodicGaugeImpl<SpGimplTypesF> SpPeriodicGimplF; // Float
typedef PeriodicGaugeImpl<SpGimplTypesD> SpPeriodicGimplD; // Double
NAMESPACE_END(Grid);
#endif

View File

@ -43,7 +43,7 @@ public:
private:
RealD c_plaq;
RealD c_rect;
typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
public:
PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};
@ -79,18 +79,27 @@ public:
GridBase *grid = Umu.Grid();
std::vector<GaugeLinkField> U (Nd,grid);
std::vector<GaugeLinkField> U2(Nd,grid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
}
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);
for (int mu=0; mu < Nd; mu++){
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
// Staple in direction mu
WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
dSdU_mu = Ta(U[mu]*staple)*factor_p;
WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

View File

@ -86,13 +86,8 @@ public:
assert(ForceE.Checkerboard()==Even);
assert(ForceO.Checkerboard()==Odd);
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorSetCheckerboard(Force,ForceE);
acceleratorSetCheckerboard(Force,ForceO);
#else
setCheckerboard(Force,ForceE);
setCheckerboard(Force,ForceO);
#endif
Force=-Force;
delete forcecb;
@ -135,13 +130,8 @@ public:
assert(ForceE.Checkerboard()==Even);
assert(ForceO.Checkerboard()==Odd);
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorSetCheckerboard(Force,ForceE);
acceleratorSetCheckerboard(Force,ForceO);
#else
setCheckerboard(Force,ForceE);
setCheckerboard(Force,ForceO);
#endif
Force=-Force;
delete forcecb;

View File

@ -38,73 +38,91 @@ NAMESPACE_BEGIN(Grid);
// cf. GeneralEvenOddRational.h for details
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class ImplD, class ImplF>
template<class ImplD, class ImplF, class ImplD2>
class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
private:
typedef typename ImplD2::FermionField FermionFieldD2;
typedef typename ImplD::FermionField FermionFieldD;
typedef typename ImplF::FermionField FermionFieldF;
FermionOperator<ImplD> & NumOpD;
FermionOperator<ImplD> & DenOpD;
FermionOperator<ImplD2> & NumOpD2;
FermionOperator<ImplD2> & DenOpD2;
FermionOperator<ImplF> & NumOpF;
FermionOperator<ImplF> & DenOpF;
Integer ReliableUpdateFreq;
protected:
//Action evaluation
//Allow derived classes to override the multishift CG
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
#if 1
#if 0
SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
msCG(schurOp,in, out);
#else
SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
FermionFieldD inD(NumOpD.FermionRedBlackGrid());
FermionFieldD outD(NumOpD.FermionRedBlackGrid());
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
// Action better with higher precision?
ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
msCG(schurOpD, in, out);
ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
precisionChange(inD2,in);
std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
msCG(schurOpD2, inD2, outD2);
precisionChange(out,outD2);
#endif
}
//Force evaluation
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
SchurDifferentiableOperator<ImplF> schurOpF (numerator ? NumOpF : DenOpF);
FermionFieldD inD(NumOpD.FermionRedBlackGrid());
FermionFieldD outD(NumOpD.FermionRedBlackGrid());
std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid());
ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
msCG(schurOpD, in, out_elems, out);
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
precisionChange(inD2,in);
std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
msCG(schurOpD2, inD2, out_elemsD2, outD2);
precisionChange(out,outD2);
for(int i=0;i<out_elems.size();i++){
precisionChange(out_elems[i],out_elemsD2[i]);
}
}
//Allow derived classes to override the gauge import
virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid());
precisionChange(Uf, Ud);
precisionChange(Ud2, Ud);
std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl;
std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
NumOpD.ImportGauge(Ud);
DenOpD.ImportGauge(Ud);
NumOpF.ImportGauge(Uf);
DenOpF.ImportGauge(Uf);
NumOpD2.ImportGauge(Ud2);
DenOpD2.ImportGauge(Ud2);
}
public:
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD> &_NumOpD, FermionOperator<ImplD> &_DenOpD,
FermionOperator<ImplF> &_NumOpF, FermionOperator<ImplF> &_DenOpF,
FermionOperator<ImplD2> &_NumOpD2, FermionOperator<ImplD2> &_DenOpD2,
const RationalActionParams & p, Integer _ReliableUpdateFreq
) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
ReliableUpdateFreq(_ReliableUpdateFreq),
NumOpD(_NumOpD), DenOpD(_DenOpD),
NumOpF(_NumOpF), DenOpF(_DenOpF)
NumOpF(_NumOpF), DenOpF(_DenOpF),
NumOpD2(_NumOpD2), DenOpD2(_DenOpD2)
{}
virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}

View File

@ -67,9 +67,9 @@ NAMESPACE_BEGIN(Grid);
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
};
template<class Impl,class ImplF>
template<class Impl,class ImplF,class ImplD2>
class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction
: public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
: public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> {
public:
typedef OneFlavourRationalParams Params;
private:
@ -91,9 +91,11 @@ NAMESPACE_BEGIN(Grid);
FermionOperator<Impl> &_DenOp,
FermionOperator<ImplF> &_NumOpF,
FermionOperator<ImplF> &_DenOpF,
FermionOperator<ImplD2> &_NumOpD2,
FermionOperator<ImplD2> &_DenOpD2,
const Params & p, Integer ReliableUpdateFreq
) :
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){}
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){}
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
};

View File

@ -207,27 +207,20 @@ NAMESPACE_BEGIN(Grid);
//X = (Mdag M)^-1 V^dag phi
//Y = (Mdag)^-1 V^dag phi
Vpc.MpcDag(PhiOdd,Y); // Y= Vdag phi
std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
X=Zero();
DerivativeSolver(Mpc,Y,X); // X= (MdagM)^-1 Vdag phi
std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl;
Mpc.Mpc(X,Y); // Y= Mdag^-1 Vdag phi
std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
// phi^dag V (Mdag M)^-1 dV^dag phi
Vpc.MpcDagDeriv(force , X, PhiOdd ); dSdU = force;
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
// phi^dag dV (Mdag M)^-1 V^dag phi
Vpc.MpcDeriv(force , PhiOdd, X ); dSdU = dSdU+force;
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
// - phi^dag V (Mdag M)^-1 Mdag dM (Mdag M)^-1 V^dag phi
// - phi^dag V (Mdag M)^-1 dMdag M (Mdag M)^-1 V^dag phi
Mpc.MpcDeriv(force,Y,X); dSdU = dSdU-force;
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
Mpc.MpcDagDeriv(force,X,Y); dSdU = dSdU-force;
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
// FIXME No force contribution from EvenEven assumed here
// Needs a fix for clover.

View File

@ -225,18 +225,6 @@ template <class RepresentationsPolicy,
using GenericHMCRunnerHirep =
HMCWrapperTemplate<PeriodicGimplR, Integrator, RepresentationsPolicy>;
// sp2n
template <template <typename, typename, typename> class Integrator>
using GenericSpHMCRunner = HMCWrapperTemplate<SpPeriodicGimplR, Integrator>;
template <class RepresentationsPolicy,
template <typename, typename, typename> class Integrator>
using GenericSpHMCRunnerHirep =
HMCWrapperTemplate<SpPeriodicGimplR, Integrator, RepresentationsPolicy>;
template <class Implementation, class RepresentationsPolicy,
template <typename, typename, typename> class Integrator>
using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator, RepresentationsPolicy>;

View File

@ -283,13 +283,12 @@ public:
std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
TheIntegrator.print_timer();
TheIntegrator.Smearer.set_Field(Ucur);
for (int obs = 0; obs < Observables.size(); obs++) {
std::cout << GridLogDebug << "Observables # " << obs << std::endl;
std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl;
std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
Observables[obs]->TrajectoryComplete(traj + 1, TheIntegrator.Smearer, sRNG, pRNG);
Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
}
std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
}

View File

@ -35,16 +35,13 @@ class CheckpointerParameters : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters,
std::string, config_prefix,
std::string, smeared_prefix,
std::string, rng_prefix,
int, saveInterval,
bool, saveSmeared,
std::string, format, );
CheckpointerParameters(std::string cf = "cfg", std::string sf="cfg_smr" , std::string rn = "rng",
CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng",
int savemodulo = 1, const std::string &f = "IEEE64BIG")
: config_prefix(cf),
smeared_prefix(sf),
rng_prefix(rn),
saveInterval(savemodulo),
format(f){};
@ -64,21 +61,13 @@ template <class Impl>
class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
public:
void build_filenames(int traj, CheckpointerParameters &Params,
std::string &conf_file,
std::string &smear_file,
std::string &rng_file) {
std::string &conf_file, std::string &rng_file) {
{
std::ostringstream os;
os << Params.rng_prefix << "." << traj;
rng_file = os.str();
}
{
std::ostringstream os;
os << Params.smeared_prefix << "." << traj;
smear_file = os.str();
}
{
std::ostringstream os;
os << Params.config_prefix << "." << traj;
@ -95,11 +84,6 @@ public:
}
virtual void initialize(const CheckpointerParameters &Params) = 0;
virtual void TrajectoryComplete(int traj,
typename Impl::Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) { assert(0); } ; // HMC should pass the smart config with smeared and unsmeared
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) = 0;

View File

@ -61,14 +61,11 @@ public:
fout.close();
}
void TrajectoryComplete(int traj,
ConfigurationBase<Field> &SmartConfig,
GridSerialRNG &sRNG, GridParallelRNG &pRNG)
{
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
uint32_t nersc_csum;
uint32_t scidac_csuma;
@ -77,15 +74,9 @@ public:
BinarySimpleUnmunger<sobj_double, sobj> munge;
truncate(rng);
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "Written Binary RNG " << rng
<< " checksum " << std::hex
<< nersc_csum <<"/"
<< scidac_csuma <<"/"
<< scidac_csumb
<< std::dec << std::endl;
truncate(config);
BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(false), config, munge, 0, Params.format,
BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "Written Binary Configuration " << config
@ -94,18 +85,6 @@ public:
<< scidac_csuma <<"/"
<< scidac_csumb
<< std::dec << std::endl;
if ( Params.saveSmeared ) {
truncate(smr);
BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(true), smr, munge, 0, Params.format,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "Written Binary Smeared Configuration " << smr
<< " checksum " << std::hex
<< nersc_csum <<"/"
<< scidac_csuma <<"/"
<< scidac_csumb
<< std::dec << std::endl;
}
}
};

View File

@ -69,27 +69,17 @@ public:
}
}
void TrajectoryComplete(int traj,
ConfigurationBase<GaugeField> &SmartConfig,
GridSerialRNG &sRNG,
void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng, smr;
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
GridBase *grid = SmartConfig.get_U(false).Grid();
GridBase *grid = U.Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "Written BINARY RNG " << rng
<< " checksum " << std::hex
<< nersc_csum<<"/"
<< scidac_csuma<<"/"
<< scidac_csumb
<< std::dec << std::endl;
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(config);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(false), traj, config, config);
_IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
@ -98,21 +88,6 @@ public:
<< scidac_csuma<<"/"
<< scidac_csumb
<< std::dec << std::endl;
if ( Params.saveSmeared ) {
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(smr);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
<< " checksum " << std::hex
<< nersc_csum<<"/"
<< scidac_csuma<<"/"
<< scidac_csumb
<< std::dec << std::endl;
}
}
};

View File

@ -52,29 +52,23 @@ public:
Params.format = "IEEE64BIG"; // fixed, overwrite any other choice
}
virtual void TrajectoryComplete(int traj,
ConfigurationBase<GaugeField> &SmartConfig,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG)
{
void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
int precision32 = 1;
int tworow = 0;
NerscIO::writeRNGState(sRNG, pRNG, rng);
NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(false), config, tworow, precision32);
if ( Params.saveSmeared ) {
NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(true), smr, tworow, precision32);
}
NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
}
};
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng );
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@ -70,37 +70,19 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
}
}
void TrajectoryComplete(int traj,
ConfigurationBase<Field> &SmartConfig,
GridSerialRNG &sRNG,
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng,smr;
this->build_filenames(traj, Params, config, smr, rng);
GridBase *grid = SmartConfig.get_U(false).Grid();
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
GridBase *grid = U.Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "Written Binary RNG " << rng
<< " checksum " << std::hex
<< nersc_csum <<"/"
<< scidac_csuma <<"/"
<< scidac_csumb
<< std::dec << std::endl;
ScidacWriter _ScidacWriter(grid->IsBoss());
_ScidacWriter.open(config);
_ScidacWriter.writeScidacFieldRecord(U, MData);
_ScidacWriter.close();
{
ScidacWriter _ScidacWriter(grid->IsBoss());
_ScidacWriter.open(config);
_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(false), MData);
_ScidacWriter.close();
}
if ( Params.saveSmeared ) {
ScidacWriter _ScidacWriter(grid->IsBoss());
_ScidacWriter.open(smr);
_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(true), MData);
_ScidacWriter.close();
}
std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl;
}
};

View File

@ -66,7 +66,6 @@ public:
template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy>
class Integrator {
protected:
public:
typedef FieldImplementation_ FieldImplementation;
typedef typename FieldImplementation::Field MomentaField; //for readability
typedef typename FieldImplementation::Field Field;
@ -87,8 +86,6 @@ public:
const ActionSet<Field, RepresentationPolicy> as;
ActionSet<Field,RepresentationPolicy> LevelForces;
//Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){
static MomentumFilterNone<MomentaField> filter;
@ -99,6 +96,7 @@ public:
{
t_P[level] += ep;
update_P(P, U, level, ep);
std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
}
@ -126,33 +124,35 @@ public:
// input U actually not used in the fundamental case
// Fundamental updates, include smearing
assert(as.size()==LevelForces.size());
Field level_force(U.Grid()); level_force =Zero();
for (int a = 0; a < as[level].actions.size(); ++a) {
double start_full = usecond();
Field force(U.Grid());
conformable(U.Grid(), Mom.Grid());
Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
double start_force = usecond();
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
as[level].actions.at(a)->deriv_timer_start();
as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
as[level].actions.at(a)->deriv_timer_stop();
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
auto name = as[level].actions.at(a)->action_name();
if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
force = FieldImplementation::projectForce(force); // Ta for gauge fields
double end_force = usecond();
// DumpSliceNorm("force ",force,Nd-1);
MomFilter->applyFilter(force);
std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<< std::endl;
// track the total
level_force = level_force+force;
DumpSliceNorm("force filtered ",force,Nd-1);
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
@ -175,16 +175,6 @@ public:
}
{
// total force
Real force_abs = std::sqrt(norm2(level_force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
Real force_max = std::sqrt(maxLocalNorm2(level_force));
Real impulse_max = force_max * ep * HMC_MOMENTUM_DENOMINATOR;
LevelForces[level].actions.at(0)->deriv_log(force_abs,force_max,impulse_abs,impulse_max);
}
// Force from the other representations
as[level].apply(update_P_hireps, Representations, Mom, U, ep);
@ -234,16 +224,6 @@ public:
//Default the momentum filter to "do-nothing"
MomFilter = getDefaultMomFilter();
for (int level = 0; level < as.size(); ++level) {
int multiplier = as.at(level).multiplier;
ActionLevel<Field, RepresentationPolicy> * Level = new ActionLevel<Field, RepresentationPolicy>(multiplier);
Level->push_back(new EmptyAction<Field>);
LevelForces.push_back(*Level);
// does it copy by value or reference??
// - answer it copies by value, BUT the action level contains a reference that is NOT updated.
// Unsafe code in Guido's area
}
};
virtual ~Integrator() {}
@ -261,14 +241,10 @@ public:
void reset_timer(void)
{
assert(as.size()==LevelForces.size());
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
as[level].actions.at(actionID)->reset_timer();
}
int actionID=0;
assert(LevelForces.at(level).actions.size()==1);
LevelForces.at(level).actions.at(actionID)->reset_timer();
}
}
void print_timer(void)
@ -330,16 +306,6 @@ public:
<<" calls " << as[level].actions.at(actionID)->deriv_num
<< std::endl;
}
int actionID=0;
std::cout << GridLogMessage
<< LevelForces[level].actions.at(actionID)->action_name()
<<"["<<level<<"]["<< actionID<<"] :\n\t\t "
<<" force max " << LevelForces[level].actions.at(actionID)->deriv_max_average()
<<" norm " << LevelForces[level].actions.at(actionID)->deriv_norm_average()
<<" Fdt max " << LevelForces[level].actions.at(actionID)->Fdt_max_average()
<<" Fdt norm " << LevelForces[level].actions.at(actionID)->Fdt_norm_average()
<<" calls " << LevelForces[level].actions.at(actionID)->deriv_num
<< std::endl;
}
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
}
@ -361,13 +327,6 @@ public:
std::cout << as[level].actions.at(actionID)->LogParameters();
}
}
std::cout << " [Integrator] Total Force loggers: "<< LevelForces.size() <<std::endl;
for (int level = 0; level < LevelForces.size(); ++level) {
std::cout << GridLogMessage << "[Integrator] ---- Level: "<< level << std::endl;
for (int actionID = 0; actionID < LevelForces[level].actions.size(); ++actionID) {
std::cout << GridLogMessage << "["<< LevelForces[level].actions.at(actionID)->action_name() << "] ID: " << actionID << std::endl;
}
}
std::cout << GridLogMessage << ":::::::::::::::::::::::::::::::::::::::::"<< std::endl;
}
@ -418,9 +377,14 @@ public:
auto name = as[level].actions.at(actionID)->action_name();
std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
as[level].actions.at(actionID)->refresh_timer_start();
as[level].actions.at(actionID)->refresh(Smearer, sRNG, pRNG);
as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
as[level].actions.at(actionID)->refresh_timer_stop();
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
}
@ -449,7 +413,6 @@ public:
RealD S(Field& U)
{ // here also U not used
assert(as.size()==LevelForces.size());
std::cout << GridLogIntegrator << "Integrator action\n";
RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom
@ -462,9 +425,10 @@ public:
// get gauge field from the SmearingPolicy and
// based on the boolean is_smeared in actionID
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
as[level].actions.at(actionID)->S_timer_start();
Hterm = as[level].actions.at(actionID)->S(Smearer);
Hterm = as[level].actions.at(actionID)->S(Us);
as[level].actions.at(actionID)->S_timer_stop();
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
H += Hterm;
@ -505,11 +469,12 @@ public:
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
// get gauge field from the SmearingPolicy and
// based on the boolean is_smeared in actionID
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
as[level].actions.at(actionID)->S_timer_start();
as[level].actions.at(actionID)->S_timer_start();
Hterm = as[level].actions.at(actionID)->S(Smearer);
as[level].actions.at(actionID)->S_timer_stop();
Hterm = as[level].actions.at(actionID)->Sinitial(Us);
as[level].actions.at(actionID)->S_timer_stop();
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
H += Hterm;

View File

@ -34,13 +34,6 @@ NAMESPACE_BEGIN(Grid);
template <class Field>
class HmcObservable {
public:
virtual void TrajectoryComplete(int traj,
ConfigurationBase<Field> &SmartConfig,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG)
{
TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
};
virtual void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,

View File

@ -42,18 +42,6 @@ public:
// necessary for HmcObservable compatibility
typedef typename Impl::Field Field;
virtual void TrajectoryComplete(int traj,
ConfigurationBase<Field> &SmartConfig,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG)
{
std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << "Unsmeared plaquette"<<std::endl;
TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
std::cout << GridLogMessage << "Smeared plaquette"<<std::endl;
TrajectoryComplete(traj,SmartConfig.get_U(true),sRNG,pRNG); // Unsmeared observable
std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
};
void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,

View File

@ -13,7 +13,7 @@ NAMESPACE_BEGIN(Grid);
* Empty since HMC updates already the fundamental representation
*/
template <int ncolour, class group_name>
template <int ncolour>
class FundamentalRep {
public:
static const int Dimension = ncolour;
@ -21,7 +21,7 @@ public:
// typdef to be used by the Representations class in HMC to get the
// types for the higher representation fields
typedef typename GaugeGroup<ncolour,group_name>::LatticeMatrix LatticeMatrix;
typedef typename SU<ncolour>::LatticeMatrix LatticeMatrix;
typedef LatticeGaugeField LatticeField;
explicit FundamentalRep(GridBase* grid) {} //do nothing
@ -45,8 +45,7 @@ public:
typedef FundamentalRep<Nc,GroupName::SU> FundamentalRepresentation;
typedef FundamentalRep<Nc,GroupName::Sp> SpFundamentalRepresentation;
typedef FundamentalRep<Nc> FundamentalRepresentation;
NAMESPACE_END(Grid);

View File

@ -20,14 +20,14 @@ NAMESPACE_BEGIN(Grid);
* in the SUnTwoIndex.h file
*/
template <int ncolour, TwoIndexSymmetry S, class group_name = GroupName::SU>
template <int ncolour, TwoIndexSymmetry S>
class TwoIndexRep {
public:
// typdef to be used by the Representations class in HMC to get the
// types for the higher representation fields
typedef typename GaugeGroupTwoIndex<ncolour, S, group_name>::LatticeTwoIndexMatrix LatticeMatrix;
typedef typename GaugeGroupTwoIndex<ncolour, S, group_name>::LatticeTwoIndexField LatticeField;
static const int Dimension = GaugeGroupTwoIndex<ncolour,S,group_name>::Dimension;
typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexMatrix LatticeMatrix;
typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexField LatticeField;
static const int Dimension = ncolour * (ncolour + S) / 2;
static const bool isFundamental = false;
LatticeField U;
@ -43,10 +43,10 @@ public:
U = Zero();
LatticeColourMatrix tmp(Uin.Grid());
Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
Vector<typename SU<ncolour>::Matrix> eij(Dimension);
for (int a = 0; a < Dimension; a++)
GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
SU_TwoIndex<ncolour, S>::base(a, eij[a]);
for (int mu = 0; mu < Nd; mu++) {
auto Uin_mu = peekLorentz(Uin, mu);
@ -71,7 +71,7 @@ public:
out_mu = Zero();
typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector h(in.Grid());
typename SU<ncolour>::LatticeAlgebraVector h(in.Grid());
projectOnAlgebra(h, in_mu, double(Nc + 2 * S)); // factor T(r)/T(fund)
FundamentalLieAlgebraMatrix(h, out_mu); // apply scale only once
pokeLorentz(out, out_mu, mu);
@ -80,23 +80,20 @@ public:
}
private:
void projectOnAlgebra(typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector &h_out,
void projectOnAlgebra(typename SU<ncolour>::LatticeAlgebraVector &h_out,
const LatticeMatrix &in, Real scale = 1.0) const {
GaugeGroupTwoIndex<ncolour, S,group_name>::projectOnAlgebra(h_out, in, scale);
SU_TwoIndex<ncolour, S>::projectOnAlgebra(h_out, in, scale);
}
void FundamentalLieAlgebraMatrix(
typename GaugeGroup<ncolour, group_name>::LatticeAlgebraVector &h,
typename GaugeGroup<ncolour, group_name>::LatticeMatrix &out, Real scale = 1.0) const {
GaugeGroup<ncolour,group_name>::FundamentalLieAlgebraMatrix(h, out, scale);
typename SU<ncolour>::LatticeAlgebraVector &h,
typename SU<ncolour>::LatticeMatrix &out, Real scale = 1.0) const {
SU<ncolour>::FundamentalLieAlgebraMatrix(h, out, scale);
}
};
typedef TwoIndexRep<Nc, Symmetric, GroupName::SU> TwoIndexSymmetricRepresentation;
typedef TwoIndexRep<Nc, AntiSymmetric, GroupName::SU> TwoIndexAntiSymmetricRepresentation;
typedef TwoIndexRep<Nc, Symmetric, GroupName::Sp> SpTwoIndexSymmetricRepresentation;
typedef TwoIndexRep<Nc, AntiSymmetric, GroupName::Sp> SpTwoIndexAntiSymmetricRepresentation;
typedef TwoIndexRep<Nc, Symmetric> TwoIndexSymmetricRepresentation;
typedef TwoIndexRep<Nc, AntiSymmetric> TwoIndexAntiSymmetricRepresentation;
NAMESPACE_END(Grid);

View File

@ -7,27 +7,26 @@
NAMESPACE_BEGIN(Grid);
//trivial class for no smearing
template< class Impl >
class NoSmearing : public ConfigurationBase<typename Impl::Field>
class NoSmearing
{
public:
INHERIT_FIELD_TYPES(Impl);
Field* ThinLinks;
Field* ThinField;
NoSmearing(): ThinLinks(NULL) {}
NoSmearing(): ThinField(NULL) {}
virtual void set_Field(Field& U) { ThinLinks = &U; }
void set_Field(Field& U) { ThinField = &U; }
virtual void smeared_force(Field&) {}
void smeared_force(Field&) const {}
virtual Field& get_SmearedU() { return *ThinLinks; }
Field& get_SmearedU() { return *ThinField; }
virtual Field &get_U(bool smeared = false)
Field &get_U(bool smeared = false)
{
return *ThinLinks;
return *ThinField;
}
};
@ -43,24 +42,19 @@ public:
It stores a list of smeared configurations.
*/
template <class Gimpl>
class SmearedConfiguration : public ConfigurationBase<typename Gimpl::Field>
class SmearedConfiguration
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
protected:
private:
const unsigned int smearingLevels;
Smear_Stout<Gimpl> *StoutSmearing;
std::vector<GaugeField> SmearedSet;
public:
GaugeField* ThinLinks; /* Pointer to the thin links configuration */ // move to base???
protected:
// Member functions
//====================================================================
// Overridden in masked version
virtual void fill_smearedSet(GaugeField &U)
void fill_smearedSet(GaugeField &U)
{
ThinLinks = &U; // attach the smearing routine to the field U
@ -88,10 +82,9 @@ protected:
}
}
}
//overridden in masked verson
virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
const GaugeField& GaugeK) const
//====================================================================
GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
const GaugeField& GaugeK) const
{
GridBase* grid = GaugeK.Grid();
GaugeField C(grid), SigmaK(grid), iLambda(grid);
@ -220,6 +213,8 @@ protected:
//====================================================================
public:
GaugeField*
ThinLinks; /* Pointer to the thin links configuration */
/* Standard constructor */
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
@ -235,7 +230,7 @@ public:
: smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
// attach the smeared routines to the thin links U and fill the smeared set
virtual void set_Field(GaugeField &U)
void set_Field(GaugeField &U)
{
double start = usecond();
fill_smearedSet(U);
@ -245,7 +240,7 @@ public:
}
//====================================================================
virtual void smeared_force(GaugeField &SigmaTilde)
void smeared_force(GaugeField &SigmaTilde) const
{
if (smearingLevels > 0)
{
@ -272,16 +267,14 @@ public:
}
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << " GaugeConfiguration: Smeared Force chain rule took " << time << " ms" << std::endl;
std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;
} // if smearingLevels = 0 do nothing
SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
}
//====================================================================
virtual GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
virtual GaugeField &get_U(bool smeared = false)
GaugeField &get_U(bool smeared = false)
{
// get the config, thin links by default
if (smeared)

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More