From f0dc0f36214c4dda1eace9d83956e5d7fef4f729 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 22 Aug 2020 13:57:33 +0200 Subject: [PATCH 01/56] fix compile issue on Qpace3 --- Grid/lattice/Lattice_transfer.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index e698e40e..91de721f 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -127,6 +127,11 @@ accelerator_inline void convertType(T1 & out, const iScalar & in) { convertType(out,in._internal); } +template::value, T1>::type* = nullptr> +accelerator_inline void convertType(T1 & out, const iScalar & in) { + convertType(out,in._internal); +} + template accelerator_inline void convertType(iScalar & out, const T2 & in) { convertType(out._internal,in); From 1292d595634172e28158f99d31863d63267ca5ac Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Thu, 11 Jun 2020 13:16:00 +0200 Subject: [PATCH 02/56] Add a typedef + broaden interface of CMat --- Grid/algorithms/CoarsenedMatrix.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 8d184aea..76950baf 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -268,6 +268,21 @@ public: typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; + typedef CoarseVector FermionField; + + // enrich interface + void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeInv(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeInvDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; } + void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; } + void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; + void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; //////////////////// // Data members From dd1ba266b269b093a8bcd9bf815438d9896d7148 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Fri, 17 Jul 2020 11:58:02 +0200 Subject: [PATCH 03/56] Fix mapping between dir + disp and point in CMat --- Grid/algorithms/CoarsenedMatrix.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 76950baf..d18fba43 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -432,25 +432,25 @@ public: ////////////// // 4D action like wilson - // 0+ => 0 - // 0- => 1 - // 1+ => 2 - // 1- => 3 + // 0+ => 0 + // 0- => 4 + // 1+ => 1 + // 1- => 5 // etc.. ////////////// // 5D action like DWF - // 1+ => 0 - // 1- => 1 - // 2+ => 2 - // 2- => 3 + // 1+ => 0 + // 1- => 4 + // 2+ => 1 + // 2- => 5 // etc.. auto point = [dir, disp, ndim](){ if(dir == 0 and disp == 0) return 8; else if ( ndim==4 ) { - return (4 * dir + 1 - disp) / 2; + return (1 - disp) / 2 * 4 + dir; } else { - return (4 * (dir-1) + 1 - disp) / 2; + return (1 - disp) / 2 * 4 + dir - 1; } }(); From b2087f14c48e881ab041bbaef7f6c444c88a895d Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 24 Aug 2020 16:54:36 +0200 Subject: [PATCH 04/56] Fix CoarsenedMatrix regarding illegal memory accesses Need a reference to geom since the lambda copies the this pointer which points to host memory, see - https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture - https://devblogs.nvidia.com/new-compiler-features-cuda-8/ --- Grid/algorithms/CoarsenedMatrix.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index d18fba43..ba40535c 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -310,6 +310,8 @@ public: Stencil.HaloExchange(in,compressor); autoView( in_v , in, AcceleratorRead); autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -331,14 +333,14 @@ public: int ptype; StencilEntry *SE; - for(int point=0;point_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -382,6 +384,7 @@ public: autoView( out_v , out, AcceleratorWrite); autoView( in_v , in, AcceleratorRead); + autoView( Stencil_v , Stencil, AcceleratorRead); const int Nsimd = CComplex::Nsimd(); typedef decltype(coalescedRead(in_v[0])) calcVector; @@ -395,12 +398,12 @@ public: int ptype; StencilEntry *SE; - SE=Stencil.GetEntry(ptype,point,ss); + SE=Stencil_v.GetEntry(ptype,point,ss); if(SE->_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); From 2a75516330925bd05681f4dba639482ca84270d7 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 26 Aug 2020 12:34:17 -0400 Subject: [PATCH 05/56] state MPI/SLURM message only on world_rank zero --- Grid/threads/Accelerator.cc | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 864d90a9..f6df4a31 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -21,22 +21,26 @@ void acceleratorInit(void) #define ENV_RANK_SLURM "SLURM_PROCID" #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" - // We extract the local rank initialization using an environment variable - if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { - printf("OPENMPI detected\n"); - rank = atoi(localRankStr); - } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { - printf("MVAPICH detected\n"); - rank = atoi(localRankStr); - } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { - printf("SLURM detected\n"); - rank = atoi(localRankStr); - } else { - printf("MPI version is unknown - bad things may happen\n"); - } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);} + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { + if (!world_rank) + printf("OPENMPI detected\n"); + rank = atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { + if (!world_rank) + printf("MVAPICH detected\n"); + rank = atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { + if (!world_rank) + printf("SLURM detected\n"); + rank = atoi(localRankStr); + } else { + if (!world_rank) + printf("MPI version is unknown - bad things may happen\n"); + } size_t totalDeviceMem=0; for (int i = 0; i < nDevices; i++) { From cf3535d16e412adf80ca1a2f7ead54003860c94b Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Thu, 27 Aug 2020 14:06:48 +0200 Subject: [PATCH 06/56] Expose more functions in CMat --- Grid/algorithms/CoarsenedMatrix.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index ba40535c..cdfc2ac3 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -271,6 +271,9 @@ public: typedef CoarseVector FermionField; // enrich interface + void Dhop(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } + void DhopEO(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } + void DhopOE(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } From 4d2dc7ba0304397f536a2bc71260266e1475ae83 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 7 Sep 2020 17:57:07 +0200 Subject: [PATCH 07/56] Enable even-odd for CoarsenedMatrix --- Grid/algorithms/CoarsenedMatrix.h | 471 +++++++++++++++++++++++--- tests/solver/Test_coarse_even_odd.cc | 475 +++++++++++++++++++++++++++ 2 files changed, 899 insertions(+), 47 deletions(-) create mode 100644 tests/solver/Test_coarse_even_odd.cc diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index cdfc2ac3..66b9c169 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -31,6 +31,7 @@ Author: paboyle #ifndef GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H +#include // needed for Dagger(Yes|No), Inverse(Yes|No) NAMESPACE_BEGIN(Grid); @@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice &CoarseInner, class Geometry { public: int npoint; + int base; std::vector directions ; std::vector displacements; + std::vector points_dagger; Geometry(int _d) { - int base = (_d==5) ? 1:0; + base = (_d==5) ? 1:0; // make coarse grid stencil for 4d , not 5d if ( _d==5 ) _d=4; @@ -72,16 +75,51 @@ public: npoint = 2*_d+1; directions.resize(npoint); displacements.resize(npoint); + points_dagger.resize(npoint); for(int d=0;d<_d;d++){ directions[d ] = d+base; directions[d+_d] = d+base; displacements[d ] = +1; displacements[d+_d]= -1; + points_dagger[d ] = d+_d; + points_dagger[d+_d] = d; } directions [2*_d]=0; displacements[2*_d]=0; + points_dagger[2*_d]=2*_d; } + int point(int dir, int disp) { + assert(disp == -1 || disp == 0 || disp == 1); + assert(base+0 <= dir && dir < base+4); + + // directions faster index = new indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 1 2 3 0 1 2 3 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 2 3 4 1 2 3 4 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + + // displacements faster index = old indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 0 1 1 2 2 3 3 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 1 2 2 3 3 4 4 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + + if(dir == 0 and disp == 0) + return 8; + else // New indexing + return (1 - disp) / 2 * 4 + dir - base; + // else // Old indexing + // return (4 * (dir - base) + 1 - disp) / 2; + } }; template @@ -258,7 +296,7 @@ public: // Fine Object == (per site) type of fine field // nbasis == number of deflation vectors template -class CoarsenedMatrix : public SparseMatrixBase > > { +class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase > > { public: typedef iVector siteVector; @@ -270,16 +308,7 @@ public: typedef Lattice FineField; typedef CoarseVector FermionField; - // enrich interface - void Dhop(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void DhopEO(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void DhopOE(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } - void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeInv(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeInvDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + // enrich interface, use default implementation as in FermionOperator /////// void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; } void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; } void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; } @@ -292,21 +321,36 @@ public: //////////////////// Geometry geom; GridBase * _grid; + GridBase* _cbgrid; int hermitian; CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; std::vector A; - + std::vector Aeven; + std::vector Aodd; + + CoarseMatrix AselfInv; + CoarseMatrix AselfInvEven; + CoarseMatrix AselfInvOdd; + + Vector dag_factor; + /////////////////////// // Interface /////////////////////// GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know + GridBase * RedBlackGrid() { return _cbgrid; }; + + int ConstEE() { return 0; } void M (const CoarseVector &in, CoarseVector &out) { conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); SimpleCompressor compressor; @@ -364,12 +408,72 @@ public: return M(in,out); } else { // corresponds to Galerkin coarsening - CoarseVector tmp(Grid()); - G5C(tmp, in); - M(tmp, out); - G5C(out, out); + return MdagNonHermitian(in, out); } }; + + void MdagNonHermitian(const CoarseVector &in, CoarseVector &out) + { + conformable(_grid,in.Grid()); + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + + for(int p=0;poSites(); + + Vector points(geom.npoint, 0); + for(int p=0; poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; @@ -379,6 +483,7 @@ public: { conformable(_grid,in.Grid()); conformable(_grid,out.Grid()); + out.Checkerboard() = in.Checkerboard(); typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -434,34 +539,7 @@ public: this->MdirComms(in); - int ndim = in.Grid()->Nd(); - - ////////////// - // 4D action like wilson - // 0+ => 0 - // 0- => 4 - // 1+ => 1 - // 1- => 5 - // etc.. - ////////////// - // 5D action like DWF - // 1+ => 0 - // 1- => 4 - // 2+ => 1 - // 2- => 5 - // etc.. - auto point = [dir, disp, ndim](){ - if(dir == 0 and disp == 0) - return 8; - else if ( ndim==4 ) { - return (1 - disp) / 2 * 4 + dir; - } else { - return (1 - disp) / 2 * 4 + dir - 1; - } - }(); - - MdirCalc(in,out,point); - + MdirCalc(in,out,geom.point(dir,disp)); }; void Mdiag(const CoarseVector &in, CoarseVector &out) @@ -470,17 +548,269 @@ public: MdirCalc(in, out, point); // No comms }; + void Mooee(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseNo); + } + + void MooeeInv(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseYes); + } + + void MooeeDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseNo); + } + + void MooeeInvDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseYes); + } + + void Meooe(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } + } + + void MeooeDag(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } + } + + void Dhop(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, A, in, out, dag); + } + + void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, Aodd, in, out, dag); + } + + void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + out.Checkerboard() = Even; + + DhopInternal(StencilOdd, Aeven, in, out, dag); + } + + void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) { + out.Checkerboard() = in.Checkerboard(); + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + + CoarseMatrix *Aself = nullptr; + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1]; + DselfInternal(StencilOdd, *Aself, in, out, dag); + } else { + Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1]; + DselfInternal(StencilEven, *Aself, in, out, dag); + } + } else { + Aself = (inv) ? &AselfInv : &A[geom.npoint-1]; + DselfInternal(Stencil, *Aself, in, out, dag); + } + assert(Aself != nullptr); + } + + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, + const CoarseVector &in, CoarseVector &out, int dag) { + int point = geom.npoint-1; + autoView( out_v, out, AcceleratorWrite); + autoView( in_v, in, AcceleratorRead); + autoView( st_v, st, AcceleratorRead); + autoView( a_v, a, AcceleratorRead); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + RealD* dag_factor_p = &dag_factor[0]; + + if(dag) { + accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb &st, std::vector &a, + const CoarseVector &in, CoarseVector &out, int dag) { + SimpleCompressor compressor; + + st.HaloExchange(in,compressor); + autoView( in_v, in, AcceleratorRead); + autoView( out_v, out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + typedef LatticeView Aview; + + // determine in what order we need the points + int npoint = geom.npoint-1; + Vector points(npoint, 0); + for(int p=0; p AcceleratorViewContainer; + for(int p=0;poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb > &linop, Aggregation & Subspace) { @@ -629,6 +959,9 @@ public: std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); + + typedef typename Cobj::scalar_object scalar_object; + + autoView(Aself_v, A[geom.npoint-1], CpuRead); + autoView(AselfInv_v, AselfInv, CpuWrite); + thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke + Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + + scalar_object selfLink = Zero(); + scalar_object selfLinkInv = Zero(); + + Coordinate lcoor; + + Grid()->LocalIndexToLocalCoor(site, lcoor); + peekLocalSite(selfLink, Aself_v, lcoor); + + for (int i = 0; i < nbasis; ++i) + for (int j = 0; j < nbasis; ++j) + selfLinkEigen(i, j) = static_cast(TensorRemove(selfLink(i, j))); + + selfLinkInvEigen = selfLinkEigen.inverse(); + + for(int i = 0; i < nbasis; ++i) + for(int j = 0; j < nbasis; ++j) + selfLinkInv(i, j) = selfLinkInvEigen(i, j); + + pokeLocalSite(selfLinkInv, AselfInv_v, lcoor); + }); + } + + void FillHalfCbs() { + std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl; + for(int p = 0; p < geom.npoint; ++p) { + pickCheckerboard(Even, Aeven[p], A[p]); + pickCheckerboard(Odd, Aodd[p], A[p]); + } + pickCheckerboard(Even, AselfInvEven, AselfInv); + pickCheckerboard(Odd, AselfInvOdd, AselfInv); + } }; NAMESPACE_END(Grid); diff --git a/tests/solver/Test_coarse_even_odd.cc b/tests/solver/Test_coarse_even_odd.cc new file mode 100644 index 00000000..dfbab747 --- /dev/null +++ b/tests/solver/Test_coarse_even_odd.cc @@ -0,0 +1,475 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/solver/Test_coarse_even_odd.cc + + Copyright (C) 2015-2020 + + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace Grid; + +#ifndef NBASIS +#define NBASIS 40 +#endif + +// NOTE: The tests in this file are written in analogy to +// - tests/core/Test_wilson_even_odd.cc +// - tests/core/Test_wilson_clover.cc + +std::vector readFromCommandlineIvec(int* argc, + char*** argv, + std::string&& option, + const std::vector& defaultValue) { + std::string arg; + std::vector ret(defaultValue); + if(GridCmdOptionExists(*argv, *argv + *argc, option)) { + arg = GridCmdOptionPayload(*argv, *argv + *argc, option); + GridCmdOptionIntVector(arg, ret); + } + return ret; +} + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + ///////////////////////////////////////////////////////////////////////////// + // Read from command line // + ///////////////////////////////////////////////////////////////////////////// + + const int nbasis = NBASIS; static_assert((nbasis & 0x1) == 0, ""); + const int nb = nbasis/2; + Coordinate blockSize = readFromCommandlineIvec(&argc, &argv, "--blocksize", {2, 2, 2, 2}); + + std::cout << GridLogMessage << "Compiled with nbasis = " << nbasis << " -> nb = " << nb << std::endl; + + ///////////////////////////////////////////////////////////////////////////// + // General setup // + ///////////////////////////////////////////////////////////////////////////// + + Coordinate clatt = GridDefaultLatt(); + for(int d=0; dshow_decomposition(); + std::cout << GridLogMessage << "Grid_c:" << std::endl; Grid_c->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_f:" << std::endl; RBGrid_f->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_c:" << std::endl; RBGrid_c->show_decomposition(); + + GridParallelRNG pRNG_f(Grid_f); + GridParallelRNG pRNG_c(Grid_c); + + std::vector seeds({1, 2, 3, 4}); + + pRNG_f.SeedFixedIntegers(seeds); + pRNG_c.SeedFixedIntegers(seeds); + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Dirac Matrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + LatticeGaugeField Umu(Grid_f); SU3::HotConfiguration(pRNG_f, Umu); + + RealD checkTolerance = (getPrecision::value == 1) ? 1e-7 : 1e-15; + + RealD mass = -0.30; + RealD csw = 1.9192; + + WilsonCloverFermionR Dwc(Umu, *Grid_f, *RBGrid_f, mass, csw, csw); + MdagMLinearOperator MdagMOp_Dwc(Dwc); + + ///////////////////////////////////////////////////////////////////////////// + // Type definitions // + ///////////////////////////////////////////////////////////////////////////// + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseDiracMatrix; + typedef CoarseDiracMatrix::CoarseVector CoarseVector; + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Aggregation // + ///////////////////////////////////////////////////////////////////////////// + + Aggregates Aggs(Grid_c, Grid_f, 0); + { + LatticeFermion tmp(Aggs.subspace[0].Grid()); + for(int n = 0; n < nb; n++) { + gaussian(pRNG_f, Aggs.subspace[n]); + G5C(tmp, Aggs.subspace[n]); + axpby(Aggs.subspace[n + nb], 0.5, -0.5, Aggs.subspace[n], tmp); + axpby(Aggs.subspace[n], 0.5, 0.5, Aggs.subspace[n], tmp); + } + } + + ///////////////////////////////////////////////////////////////////////////// + // Setup of CoarsenedMatrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + const int hermitian = 0; + CoarseDiracMatrix Dc(*Grid_c, *RBGrid_c, hermitian); + Dc.CoarsenOperator(Grid_f, MdagMOp_Dwc, Aggs); + MdagMLinearOperator MdagMOp_Dc(Dc); + + ///////////////////////////////////////////////////////////////////////////// + // Setup vectors used in all tests // + ///////////////////////////////////////////////////////////////////////////// + + CoarseVector src(Grid_c); random(pRNG_c, src); + CoarseVector diff(Grid_c); diff = Zero(); + + ///////////////////////////////////////////////////////////////////////////// + // Start of tests // + ///////////////////////////////////////////////////////////////////////////// + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Dhop + Mdiag = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + Dc.Mdiag(src, phi); std::cout << GridLogMessage << "Applied Mdiag" << std::endl; + Dc.Dhop(src, chi, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + Dc.M(src, ref); std::cout << GridLogMessage << "Applied M" << std::endl; + + res = phi + chi; + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Munprec), norm2(Dhop + Mdiag), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev << " -> check " + << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe = Dhop" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector res_e(RBGrid_c); res_e = Zero(); + CoarseVector res_o(RBGrid_c); res_o = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + Dc.Meooe(src_e, res_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(src_o, res_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Dhop(src, ref, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + + setCheckerboard(res, res_o); + setCheckerboard(res, res_e); + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dhop), norm2(Meo + Moe), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag M^dag M v)| = 0" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.M(src, tmp); std::cout << GridLogMessage << "Applied M" << std::endl; + Dc.Mdag(tmp, phi); std::cout << GridLogMessage << "Applied Mdag" << std::endl; + + std::cout << GridLogMessage << "src = " << norm2(src) << " tmp = " << norm2(tmp) << " phi = " << norm2(phi) << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag M^dag M v), Im(v^dag M^dag M v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag Mooee^dag Mooee v)| = 0 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeDag(tmp, phi); std::cout << GridLogMessage << "Applied MooeeDag" << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag Mooee^dag Mooee v), Im(v^dag Mooee^dag Mooee v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeInv(tmp, phi); std::cout << GridLogMessage << "Applied MooeeInv" << std::endl; + + diff = src - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(src); + std::cout << GridLogMessage << "norm2(src), norm2(MooeeInv Mooee src), abs. deviation, rel. deviation: " + << norm2(src) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MeooeDagger is the dagger of Meooe by requiring" << std::endl; + std::cout << GridLogMessage << "= < phi | Meooe | chi > * = < chi | Meooe^dag| phi>" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + // clang-format off + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + // clang-format on + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + Dc.Meooe(chi_e, dchi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(chi_o, dchi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.MeooeDag(phi_e, dphi_o); std::cout << GridLogMessage << "Applied MeoDag" << std::endl; + Dc.MeooeDag(phi_o, dphi_e); std::cout << GridLogMessage << "Applied MoeDag" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogDebug << "norm dchi_e = " << norm2(dchi_e) << " norm dchi_o = " << norm2(dchi_o) << " norm dphi_e = " << norm2(dphi_e) + << " norm dphi_o = " << norm2(dphi_e) << std::endl; + + std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_o) " << phiDchi_e - conj(chiDphi_o) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_e) " << phiDchi_o - conj(chiDphi_e) << std::endl; + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.Mooee(chi_e, tmp_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.MooeeInv(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInv" << std::endl; + Dc.Mooee(chi_o, tmp_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.MooeeInv(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInv" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeInv Mee chi), abs. deviation, rel. deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeDag MooeeInvDag = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.MooeeDag(chi_e, tmp_e); std::cout << GridLogMessage << "Applied MeeDag" << std::endl; + Dc.MooeeInvDag(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInvDag" << std::endl; + Dc.MooeeDag(chi_o, tmp_o); std::cout << GridLogMessage << "Applied MooDag" << std::endl; + Dc.MooeeInvDag(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInvDag" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeDag MeeInvDag chi), abs. deviation, rel. deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe + Moo + Mee = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + + // M phi = (Mooee src_e + Meooe src_o , Mooee src_o + Meooe src_e) + + Dc.M(src, ref); // Reference result from the unpreconditioned operator + + // EO matrix + Dc.Mooee(src_e, chi_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.Mooee(src_o, chi_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.Meooe(src_o, phi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Meooe(src_e, phi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + + phi_o += chi_o; + phi_e += chi_e; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + std::cout << GridLogDebug << "norm phi_e = " << norm2(phi_e) << " norm phi_o = " << norm2(phi_o) << " norm phi = " << norm2(phi) << std::endl; + + diff = ref - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dunprec), norm2(Deoprec), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MpcDagMpc is hermitian" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + SchurDiagMooeeOperator HermOpEO(Dc); + + HermOpEO.MpcDagMpc(chi_e, dchi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_e" << std::endl; + HermOpEO.MpcDagMpc(chi_o, dchi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_o" << std::endl; + HermOpEO.MpcDagMpc(phi_e, dphi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_e" << std::endl; + HermOpEO.MpcDagMpc(phi_o, dphi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_o" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_e) " << phiDchi_e - conj(chiDphi_e) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_o) " << phiDchi_o - conj(chiDphi_o) << std::endl; + } + + Grid_finalize(); +} From 01652d8cfea1ad463b037b20aabfc4f3b0dc7d37 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 05:56:02 -0400 Subject: [PATCH 08/56] SlabAllocator --- Grid/threads/Accelerator.h | 13 +-- Grid/threads/SlabAllocator.cc | 161 ++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 11 deletions(-) create mode 100644 Grid/threads/SlabAllocator.cc diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1a3dfdc2..89d45a17 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -153,18 +153,7 @@ inline void *acceleratorAllocShared(size_t bytes) } return ptr; }; -inline void *acceleratorAllocDevice(size_t bytes) -{ - void *ptr=NULL; - auto err = cudaMalloc((void **)&ptr,bytes); - if( err != cudaSuccess ) { - ptr = (void *) NULL; - printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); - } - return ptr; -}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; -inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline int acceleratorIsCommunicable(void *ptr) @@ -176,6 +165,8 @@ inline int acceleratorIsCommunicable(void *ptr) if(uvm) return 0; else return 1; } +void *acceleratorAllocDevice(size_t bytes); +void acceleratorFreeDevice(void* ptr); #endif diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc new file mode 100644 index 00000000..da863687 --- /dev/null +++ b/Grid/threads/SlabAllocator.cc @@ -0,0 +1,161 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./Grid/threads/SlabAllocator.cc + + Copyright (C) 2020 + +Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +#include + +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_CUDA + +#define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) +#define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) + +void *acceleratorAllocDeviceCUDA(size_t bytes) { + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + if( err != cudaSuccess ) { + ptr = (void *) NULL; + printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); + } + return ptr; +} + +void acceleratorFreeDeviceCUDA(void *ptr) { + cudaFree(ptr); +} + +struct grid_device_heap_slab_t { + void* Ptr; + size_t ElementSize; + size_t Elements; + std::unordered_set Allocated; + std::unordered_set Available; +}; + +std::unordered_map DeviceHeapPtrTable; +std::unordered_map > DeviceHeapSlabTable; + +void* SlabAllocateElement(grid_device_heap_slab_t* slab) { + assert(!slab->Available.empty()); + auto available = slab->Available.begin(); + auto slot = *available; + slab->Allocated.insert(slot); + slab->Available.erase(available); + + void* Ptr = (void*)((char*)slab->Ptr + slot * slab->ElementSize); + DeviceHeapPtrTable[Ptr] = slab; + + //std::cout << "Allocate element " << slot << " of slab " << slab << " of size " << slab->ElementSize << " with elements " << slab->Elements << + // " (allocated = " << slab->Allocated.size() << ", available = " << slab->Available.size() << ")" << std::endl; + + return Ptr; +} + +void SlabRemove(grid_device_heap_slab_t* slab) { + auto & t = DeviceHeapSlabTable[slab->ElementSize]; + assert(slab->Ptr); + DeviceHeapPtrTable.erase(slab->Ptr); + acceleratorFreeDeviceCUDA(slab->Ptr); + assert(t.count(slab) == 1); + t.erase(slab); + delete slab; + //std::cout << "Remove slab " << slab << std::endl; +} + +void SlabFreeElement(grid_device_heap_slab_t* slab, void* ElementPtr) { + size_t Offset = (size_t)ElementPtr - (size_t)slab->Ptr; + //std::cout << "SlabFreeElement offset " << Offset << std::endl; + assert(Offset < GRID_DEVICE_HEAP_SLAB_SIZE); + assert(Offset % slab->ElementSize == 0); + size_t slot = Offset / slab->ElementSize; + assert(slot >= 0); + assert(slab->Allocated.count(slot) == 1 && slab->Available.count(slot) == 0); + slab->Allocated.erase(slot); + slab->Available.insert(slot); + + //std::cout << "Free element " << slot << " of slab" << slab << std::endl; + + if (slab->Allocated.empty()) { + SlabRemove(slab); + } +} + +grid_device_heap_slab_t* SlabFind(size_t bytes) { + + grid_device_heap_slab_t* slab = 0; + std::unordered_set* slab_set = 0; + + decltype(DeviceHeapSlabTable.begin()) slabs = DeviceHeapSlabTable.find(bytes); + if (slabs == DeviceHeapSlabTable.end()) { + slab_set = &DeviceHeapSlabTable[bytes]; + } else { + slab_set = &slabs->second; + } + + for (auto& s : *slab_set) { + if (!s->Available.empty()) { + slab = &(*s); + break; + } + } + + if (!slab) { + slab = new grid_device_heap_slab_t; + slab_set->insert(slab); + slab->Ptr = acceleratorAllocDeviceCUDA(GRID_DEVICE_HEAP_SLAB_SIZE); + slab->ElementSize = bytes; + slab->Elements = GRID_DEVICE_HEAP_SLAB_SIZE / bytes; + for (size_t i=0;iElements;i++) + slab->Available.insert(i); + //std::cout << "New slab" << slab << std::endl; + } + + return slab; +} + +void *acceleratorAllocDevice(size_t bytes) { + if (bytes >= GRID_DEVICE_HEAP_SLAB_THRESHOLD) { + return acceleratorAllocDeviceCUDA(bytes); + } + + return SlabAllocateElement(SlabFind(bytes)); +} + +void acceleratorFreeDevice(void *ptr) { + auto p = DeviceHeapPtrTable.find(ptr); + if (p == DeviceHeapPtrTable.end()) { + acceleratorFreeDeviceCUDA(ptr); + } else { + SlabFreeElement(p->second,ptr); + } +} + +#endif + +NAMESPACE_END(Grid); From 32ff766dbd8416b45ae042463c8f65145b75806f Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:02:53 -0400 Subject: [PATCH 09/56] fix evict scheme, slab alloc --- Grid/allocator/MemoryManagerCache.cc | 6 ++++-- Grid/threads/SlabAllocator.cc | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 5dd7575e..7d4581d7 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -227,12 +227,13 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; + if (!AccCache.AccPtr) + EvictVictims(bytes); assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); @@ -361,12 +362,13 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; + if (!AccCache.AccPtr) + EvictVictims(bytes); assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc index da863687..5590f835 100644 --- a/Grid/threads/SlabAllocator.cc +++ b/Grid/threads/SlabAllocator.cc @@ -36,6 +36,9 @@ NAMESPACE_BEGIN(Grid); #define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) #define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) +size_t currentDeviceAlloc = 0; +std::unordered_map ptr_size; + void *acceleratorAllocDeviceCUDA(size_t bytes) { void *ptr=NULL; auto err = cudaMalloc((void **)&ptr,bytes); @@ -43,11 +46,16 @@ void *acceleratorAllocDeviceCUDA(size_t bytes) { ptr = (void *) NULL; printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); } + currentDeviceAlloc += bytes; + ptr_size[ptr] = bytes; + std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; return ptr; } void acceleratorFreeDeviceCUDA(void *ptr) { cudaFree(ptr); + currentDeviceAlloc -= ptr_size[ptr]; + std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; } struct grid_device_heap_slab_t { From d50a2164d73d2c01048cd1f25f765a720472b2c6 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:06:06 -0400 Subject: [PATCH 10/56] remove slab allocator --- Grid/threads/Accelerator.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 89d45a17..1a3dfdc2 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -153,7 +153,18 @@ inline void *acceleratorAllocShared(size_t bytes) } return ptr; }; +inline void *acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + if( err != cudaSuccess ) { + ptr = (void *) NULL; + printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); + } + return ptr; +}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; +inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline int acceleratorIsCommunicable(void *ptr) @@ -165,8 +176,6 @@ inline int acceleratorIsCommunicable(void *ptr) if(uvm) return 0; else return 1; } -void *acceleratorAllocDevice(size_t bytes); -void acceleratorFreeDevice(void* ptr); #endif From 5cffa05c7e2965216200e7fff3183fce3f15c8bb Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:06:25 -0400 Subject: [PATCH 11/56] remove slab allocator file --- Grid/threads/SlabAllocator.cc | 169 ---------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 Grid/threads/SlabAllocator.cc diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc deleted file mode 100644 index 5590f835..00000000 --- a/Grid/threads/SlabAllocator.cc +++ /dev/null @@ -1,169 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./Grid/threads/SlabAllocator.cc - - Copyright (C) 2020 - -Author: Christoph Lehner - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#include - -#include - -NAMESPACE_BEGIN(Grid); - -#ifdef GRID_CUDA - -#define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) -#define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) - -size_t currentDeviceAlloc = 0; -std::unordered_map ptr_size; - -void *acceleratorAllocDeviceCUDA(size_t bytes) { - void *ptr=NULL; - auto err = cudaMalloc((void **)&ptr,bytes); - if( err != cudaSuccess ) { - ptr = (void *) NULL; - printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); - } - currentDeviceAlloc += bytes; - ptr_size[ptr] = bytes; - std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; - return ptr; -} - -void acceleratorFreeDeviceCUDA(void *ptr) { - cudaFree(ptr); - currentDeviceAlloc -= ptr_size[ptr]; - std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; -} - -struct grid_device_heap_slab_t { - void* Ptr; - size_t ElementSize; - size_t Elements; - std::unordered_set Allocated; - std::unordered_set Available; -}; - -std::unordered_map DeviceHeapPtrTable; -std::unordered_map > DeviceHeapSlabTable; - -void* SlabAllocateElement(grid_device_heap_slab_t* slab) { - assert(!slab->Available.empty()); - auto available = slab->Available.begin(); - auto slot = *available; - slab->Allocated.insert(slot); - slab->Available.erase(available); - - void* Ptr = (void*)((char*)slab->Ptr + slot * slab->ElementSize); - DeviceHeapPtrTable[Ptr] = slab; - - //std::cout << "Allocate element " << slot << " of slab " << slab << " of size " << slab->ElementSize << " with elements " << slab->Elements << - // " (allocated = " << slab->Allocated.size() << ", available = " << slab->Available.size() << ")" << std::endl; - - return Ptr; -} - -void SlabRemove(grid_device_heap_slab_t* slab) { - auto & t = DeviceHeapSlabTable[slab->ElementSize]; - assert(slab->Ptr); - DeviceHeapPtrTable.erase(slab->Ptr); - acceleratorFreeDeviceCUDA(slab->Ptr); - assert(t.count(slab) == 1); - t.erase(slab); - delete slab; - //std::cout << "Remove slab " << slab << std::endl; -} - -void SlabFreeElement(grid_device_heap_slab_t* slab, void* ElementPtr) { - size_t Offset = (size_t)ElementPtr - (size_t)slab->Ptr; - //std::cout << "SlabFreeElement offset " << Offset << std::endl; - assert(Offset < GRID_DEVICE_HEAP_SLAB_SIZE); - assert(Offset % slab->ElementSize == 0); - size_t slot = Offset / slab->ElementSize; - assert(slot >= 0); - assert(slab->Allocated.count(slot) == 1 && slab->Available.count(slot) == 0); - slab->Allocated.erase(slot); - slab->Available.insert(slot); - - //std::cout << "Free element " << slot << " of slab" << slab << std::endl; - - if (slab->Allocated.empty()) { - SlabRemove(slab); - } -} - -grid_device_heap_slab_t* SlabFind(size_t bytes) { - - grid_device_heap_slab_t* slab = 0; - std::unordered_set* slab_set = 0; - - decltype(DeviceHeapSlabTable.begin()) slabs = DeviceHeapSlabTable.find(bytes); - if (slabs == DeviceHeapSlabTable.end()) { - slab_set = &DeviceHeapSlabTable[bytes]; - } else { - slab_set = &slabs->second; - } - - for (auto& s : *slab_set) { - if (!s->Available.empty()) { - slab = &(*s); - break; - } - } - - if (!slab) { - slab = new grid_device_heap_slab_t; - slab_set->insert(slab); - slab->Ptr = acceleratorAllocDeviceCUDA(GRID_DEVICE_HEAP_SLAB_SIZE); - slab->ElementSize = bytes; - slab->Elements = GRID_DEVICE_HEAP_SLAB_SIZE / bytes; - for (size_t i=0;iElements;i++) - slab->Available.insert(i); - //std::cout << "New slab" << slab << std::endl; - } - - return slab; -} - -void *acceleratorAllocDevice(size_t bytes) { - if (bytes >= GRID_DEVICE_HEAP_SLAB_THRESHOLD) { - return acceleratorAllocDeviceCUDA(bytes); - } - - return SlabAllocateElement(SlabFind(bytes)); -} - -void acceleratorFreeDevice(void *ptr) { - auto p = DeviceHeapPtrTable.find(ptr); - if (p == DeviceHeapPtrTable.end()) { - acceleratorFreeDeviceCUDA(ptr); - } else { - SlabFreeElement(p->second,ptr); - } -} - -#endif - -NAMESPACE_END(Grid); From d060341168ab9082649f813fc3727fb65f08d3cd Mon Sep 17 00:00:00 2001 From: KANAMORI Issaku Date: Fri, 16 Oct 2020 21:39:17 +0900 Subject: [PATCH 12/56] add an error check for Parameters.StartingType --- Grid/qcd/hmc/GenericHMCrunner.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index c2443dd0..98e8175a 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -159,6 +159,13 @@ private: Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U, Resources.GetSerialRNG(), Resources.GetParallelRNG()); + } else { + // others + std::cout << GridLogError << "Unrecognized StartingType\n"; + std::cout + << GridLogError + << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + exit(1); } Smearing.set_Field(U); From 463d72d322581a256369e9c42cdb30ce0e5595fc Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Mon, 19 Oct 2020 16:13:28 +0100 Subject: [PATCH 13/56] Added untraced baryon contraction code --- Grid/qcd/utils/BaryonUtils.h | 231 +++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index b268b684..beab3436 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -61,6 +61,16 @@ public: const int parity, const bool * wick_contractions, robj &result); + template + static void baryon_site_matrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool * wick_contractions, + robj &result); public: static void Wick_Contractions(std::string qi, std::string qf, @@ -75,6 +85,15 @@ public: const bool* wick_contractions, const int parity, ComplexField &baryon_corr); + static void ContractBaryons_matrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + SpinMatrixField &baryon_corr); template static void ContractBaryons_Sliced(const mobj &D1, const mobj &D2, @@ -87,6 +106,17 @@ public: const int parity, const int nt, robj &result); + template + static void ContractBaryons_Sliced_matrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int nt, + robj &result); private: template static void Baryon_Gamma_3pt_Group1_Site( @@ -329,6 +359,126 @@ void BaryonUtils::baryon_site(const mobj &D1, }} } +//New version without parity projection or trace +template +template +void BaryonUtils::baryon_site_matrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, + const bool * wick_contraction, + robj &result) +{ + + auto D1_GAi = D1 * GammaA_i; + auto GAf_D1_GAi = GammaA_f * D1_GAi; + auto GBf_D1_GAi = GammaB_f * D1_GAi; + + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; + + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = epsilon[ie_f][0]; //a + int b_f = epsilon[ie_f][1]; //b + int c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = epsilon[ie_i][0]; //a' + int b_i = epsilon[ie_i][1]; //b' + int c_i = epsilon[ie_i][2]; //c' + + Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + //This is the \delta_{456}^{123} part + if (wick_contraction[0]){ + for (int rho_i=0; rho_i::ContractBaryons(const PropagatorField &q1_left, t += usecond(); std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; +} + +template +void BaryonUtils::ContractBaryons_matrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + SpinMatrixField &baryon_corr) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; + std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; + std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; + std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; + + GridBase *grid = q1_left.Grid(); + + autoView(vbaryon_corr, baryon_corr,CpuWrite); + autoView( v1 , q1_left, CpuRead); + autoView( v2 , q2_left, CpuRead); + autoView( v3 , q3_left, CpuRead); + + // Real bytes =0.; + // bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); + // for (int ie=0; ie < 6 ; ie++){ + // if(ie==0 or ie==3){ + // bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; + // } + // else{ + // bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; + // } + // } + // Real t=0.; + // t =-usecond(); + + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto D1 = v1[ss]; + auto D2 = v2[ss]; + auto D3 = v3[ss]; + sobj result=Zero(); + baryon_site_matrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); + vbaryon_corr[ss] = result; + } );//end loop over lattice sites + + // t += usecond(); + + // std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } @@ -442,6 +646,33 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, } } +template +template +void BaryonUtils::ContractBaryons_Sliced_matrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int nt, + robj &result) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; + std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; + std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; + std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; + + for (int t=0; t Date: Wed, 21 Oct 2020 11:58:53 +0100 Subject: [PATCH 14/56] BaryonUtils function naming change --- Grid/qcd/utils/BaryonUtils.h | 86 ++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index beab3436..1259225a 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -51,7 +51,7 @@ public: private: template - static void baryon_site(const mobj &D1, + static void BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -62,7 +62,7 @@ public: const bool * wick_contractions, robj &result); template - static void baryon_site_matrix(const mobj &D1, + static void BaryonSiteMatrix(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -72,7 +72,7 @@ public: const bool * wick_contractions, robj &result); public: - static void Wick_Contractions(std::string qi, + static void WickContractions(std::string qi, std::string qf, bool* wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, @@ -85,7 +85,7 @@ public: const bool* wick_contractions, const int parity, ComplexField &baryon_corr); - static void ContractBaryons_matrix(const PropagatorField &q1_left, + static void ContractBaryonsMatrix(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, const Gamma GammaA_left, @@ -95,7 +95,7 @@ public: const bool* wick_contractions, SpinMatrixField &baryon_corr); template - static void ContractBaryons_Sliced(const mobj &D1, + static void ContractBaryonsSliced(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -107,7 +107,7 @@ public: const int nt, robj &result); template - static void ContractBaryons_Sliced_matrix(const mobj &D1, + static void ContractBaryonsSlicedMatrix(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -119,7 +119,7 @@ public: robj &result); private: template - static void Baryon_Gamma_3pt_Group1_Site( + static void BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, @@ -131,7 +131,7 @@ public: robj &result); template - static void Baryon_Gamma_3pt_Group2_Site( + static void BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, @@ -143,7 +143,7 @@ public: robj &result); template - static void Baryon_Gamma_3pt_Group3_Site( + static void BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, @@ -155,7 +155,7 @@ public: robj &result); public: template - static void Baryon_Gamma_3pt( + static void BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, @@ -168,7 +168,7 @@ public: SpinMatrixField &stn_corr); private: template - static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, + static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -177,7 +177,7 @@ public: const Gamma GammaB_nucl, robj &result); template - static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, + static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -189,7 +189,7 @@ public: template - static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, + static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -198,7 +198,7 @@ public: const Gamma GammaB_nucl, robj &result); template - static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, + static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -209,7 +209,7 @@ public: robj &result); public: template - static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, + static void SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, @@ -219,7 +219,7 @@ public: const std::string op, SpinMatrixField &stn_corr); template - static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, + static void SigmaToNucleonNonEye(const PropagatorField &qq_ti, const PropagatorField &qq_tf, const mobj &Du_spec, const PropagatorField &qd_tf, @@ -247,7 +247,7 @@ const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; //This is the old version template template -void BaryonUtils::baryon_site(const mobj &D1, +void BaryonUtils::BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_i, @@ -362,7 +362,7 @@ void BaryonUtils::baryon_site(const mobj &D1, //New version without parity projection or trace template template -void BaryonUtils::baryon_site_matrix(const mobj &D1, +void BaryonUtils::BaryonSiteMatrix(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_i, @@ -484,7 +484,7 @@ void BaryonUtils::baryon_site_matrix(const mobj &D1, * flavours. * * The array wick_contractions must be of length 6 */ template -void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { +void BaryonUtils::WickContractions(std::string qi, std::string qf, bool* wick_contractions) { const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; for (int ie=0; ie < 6 ; ie++) { wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 @@ -547,7 +547,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D2 = v2[ss]; auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); + BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites @@ -557,7 +557,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, } template -void BaryonUtils::ContractBaryons_matrix(const PropagatorField &q1_left, +void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, const Gamma GammaA_left, @@ -601,7 +601,7 @@ void BaryonUtils::ContractBaryons_matrix(const PropagatorField &q1_left, auto D2 = v2[ss]; auto D3 = v3[ss]; sobj result=Zero(); - baryon_site_matrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); + BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites @@ -618,7 +618,7 @@ void BaryonUtils::ContractBaryons_matrix(const PropagatorField &q1_left, * Wick_Contractions function above */ template template -void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, +void BaryonUtils::ContractBaryonsSliced(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -642,13 +642,13 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); for (int t=0; t template -void BaryonUtils::ContractBaryons_Sliced_matrix(const mobj &D1, +void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -669,7 +669,7 @@ void BaryonUtils::ContractBaryons_Sliced_matrix(const mobj &D1, std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; for (int t=0; t::ContractBaryons_Sliced_matrix(const mobj &D1, * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( +void BaryonUtils::BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, @@ -777,7 +777,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( +void BaryonUtils::BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, @@ -867,7 +867,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( +void BaryonUtils::BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, @@ -959,7 +959,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( * https://aportelli.github.io/Hadrons-doc/#/mcontraction */ template template -void BaryonUtils::Baryon_Gamma_3pt( +void BaryonUtils::BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, @@ -982,7 +982,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites } else if (group == 2) { @@ -990,7 +990,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites } else if (group == 3) { @@ -998,7 +998,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites @@ -1018,7 +1018,7 @@ void BaryonUtils::Baryon_Gamma_3pt( * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, +void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -1069,7 +1069,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, +void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -1128,7 +1128,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, +void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -1179,7 +1179,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, +void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -1233,7 +1233,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, template template -void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, +void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, @@ -1260,9 +1260,9 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, auto Ds_ti = vs_ti[ss]; sobj result=Zero(); if(op == "Q1"){ - Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } @@ -1272,7 +1272,7 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, template template -void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, +void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, const PropagatorField &qq_tf, const mobj &Du_spec, const PropagatorField &qd_tf, @@ -1302,9 +1302,9 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, auto Ds_ti = vs_ti[ss]; sobj result=Zero(); if(op == "Q1"){ - Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } From 52d17987dc3e09f537fa8d7d3a9403712e3d62c5 Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Fri, 23 Oct 2020 11:41:08 +0100 Subject: [PATCH 15/56] BaryonUtils.h updated debug output --- Grid/qcd/utils/BaryonUtils.h | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 1259225a..15516b56 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -514,11 +514,6 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); @@ -553,7 +548,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, t += usecond(); - std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; + std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } template @@ -570,11 +565,6 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; GridBase *grid = q1_left.Grid(); @@ -607,7 +597,7 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, // t += usecond(); - // std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; + // std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } @@ -633,11 +623,6 @@ void BaryonUtils::ContractBaryonsSliced(const mobj &D1, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); @@ -663,11 +648,6 @@ void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - for (int t=0; t Date: Tue, 20 Oct 2020 10:11:43 +0200 Subject: [PATCH 16/56] Thread inversion of clover term --- .../WilsonCloverFermionImplementation.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index e721c20d..3032a80c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -92,20 +92,16 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) int lvol = _Umu.Grid()->lSites(); int DimRep = Impl::Dimension; - Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - - Coordinate lcoor; - typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); - { autoView(CTv,CloverTerm,CpuRead); autoView(CTIv,CloverTermInv,CpuWrite); - for (int site = 0; site < lvol; site++) { + thread_for(site, lvol, { + Coordinate lcoor; grid->LocalIndexToLocalCoor(site, lcoor); - EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); peekLocalSite(Qx, CTv, lcoor); - Qxinv = Zero(); //if (csw!=0){ for (int j = 0; j < Ns; j++) for (int k = 0; k < Ns; k++) @@ -126,7 +122,7 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; // } pokeLocalSite(Qxinv, CTIv, lcoor); - } + }); } // Separate the even and odd parts From f313565a3cdc9e9487921f7c4d33c14c1292082b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 31 Oct 2020 12:12:40 +0000 Subject: [PATCH 17/56] HiP compile --- Grid/serialisation/JSON_IO.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/serialisation/JSON_IO.cc b/Grid/serialisation/JSON_IO.cc index aca8bab3..f2282099 100644 --- a/Grid/serialisation/JSON_IO.cc +++ b/Grid/serialisation/JSON_IO.cc @@ -26,7 +26,7 @@ *************************************************************************************/ /* END LEGAL */ #include -#ifndef __NVCC__ +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) NAMESPACE_BEGIN(Grid); From d10422ded88d34a6e72c16c31cfc74dd4805ca5b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 31 Oct 2020 18:12:30 -0400 Subject: [PATCH 18/56] Test project on group --- tests/core/Test_unary.cc | 106 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 tests/core/Test_unary.cc diff --git a/tests/core/Test_unary.cc b/tests/core/Test_unary.cc new file mode 100644 index 00000000..2ad6ba7b --- /dev/null +++ b/tests/core/Test_unary.cc @@ -0,0 +1,106 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU::HotConfiguration(pRNG,Umu); + + U = PeekIndex(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "< Date: Sat, 31 Oct 2020 18:12:47 -0400 Subject: [PATCH 19/56] Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt --- Grid/lattice/Lattice_ET.h | 2 -- Grid/simd/Grid_vector_unops.h | 8 -------- Grid/tensors/Tensor_Ta.h | 7 ++++++- Grid/tensors/Tensor_unary.h | 1 - 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index f828ef30..4a8a7423 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -350,7 +350,6 @@ GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnarySqrt, sqrt(a)); -GridUnopClass(UnaryRsqrt, rsqrt(a)); GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryAsin, asin(a)); @@ -463,7 +462,6 @@ GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the // abs-fabs-dabs-labs thing GRID_DEF_UNOP(sqrt, UnarySqrt); -GRID_DEF_UNOP(rsqrt, UnaryRsqrt); GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(asin, UnaryAsin); diff --git a/Grid/simd/Grid_vector_unops.h b/Grid/simd/Grid_vector_unops.h index d225699b..b89bb785 100644 --- a/Grid/simd/Grid_vector_unops.h +++ b/Grid/simd/Grid_vector_unops.h @@ -125,14 +125,6 @@ accelerator_inline Grid_simd sqrt(const Grid_simd &r) { return SimdApply(SqrtRealFunctor(), r); } template -accelerator_inline Grid_simd rsqrt(const Grid_simd &r) { - return SimdApply(RSqrtRealFunctor(), r); -} -template -accelerator_inline Scalar rsqrt(const Scalar &r) { - return (RSqrtRealFunctor(), r); -} -template accelerator_inline Grid_simd cos(const Grid_simd &r) { return SimdApply(CosRealFunctor(), r); } diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index 1ef9fc23..f7af85b7 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -92,17 +92,22 @@ accelerator_inline iMatrix ProjectOnGroup(const iMatrix &arg) { // need a check for the group type? iMatrix ret(arg); + vtype rnrm; vtype nrm; vtype inner; for(int c1=0;c1 Date: Sat, 31 Oct 2020 18:14:31 -0400 Subject: [PATCH 20/56] Hip Free managed --- Grid/threads/Accelerator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index da8a85b0..d1a96266 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -328,7 +328,7 @@ inline void *acceleratorAllocDevice(size_t bytes) return ptr; }; -inline void acceleratorFreeShared(void *ptr){ free(ptr);}; +inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} From 5eeabaa2bb3f65f911817a4783fc43a45180baa5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 1 Nov 2020 01:16:01 +0000 Subject: [PATCH 21/56] HIP fix --- Grid/lattice/Lattice_basis.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index af9d7280..95f55d10 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) basis_v.push_back(basis[k].View(AcceleratorWrite)); } -#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) +#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) ) int max_threads = thread_max(); Vector < vobj > Bt(Nm * max_threads); thread_region @@ -161,11 +161,12 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in double * Qt_j = & Qt_jv[0]; for(int k=0;koSites(),vobj::Nsimd(),{ auto B=coalescedRead(zz); for(int k=k0; k Date: Sat, 7 Nov 2020 13:32:16 +0100 Subject: [PATCH 22/56] Volume divisible guarantee --- benchmarks/Benchmark_ITT.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index eb275728..538366d7 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -334,8 +334,9 @@ public: int threads = GridThread::GetThreads(); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); - GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); @@ -343,7 +344,6 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout< Date: Thu, 12 Nov 2020 20:29:58 +0100 Subject: [PATCH 23/56] Host memory explict --- benchmarks/Benchmark_comms.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 232030c8..ccffb564 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -94,8 +94,8 @@ int main (int argc, char ** argv) RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; - std::vector > xbuf(8); - std::vector > rbuf(8); + std::vector > xbuf(8); + std::vector > rbuf(8); for(int mu=0;mu<8;mu++){ xbuf[mu].resize(lat*lat*lat*Ls); From 50b808ab33af9d12d47189f723802c6c31a9aa69 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 12 Nov 2020 22:28:12 +0100 Subject: [PATCH 24/56] Configure option between host and device --- Grid/cshift/Cshift_common.h | 88 +++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index b0dd068d..f2f39815 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -35,7 +35,7 @@ extern Vector > Cshift_table; // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask, int off=0) +Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen } } { - autoView(rhs_v , rhs, AcceleratorRead); auto buffer_p = & buffer[0]; auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT + autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for(i,ent,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for2d(n,e1,b,e2,1,{ int o = n*n1; @@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice &rhs, vobj temp =rhs_v[so+o+b]; extract(temp,pointers,offset); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + int o = n*n1; + int offset = b+n*e2; + + vobj temp =rhs_v[so+o+b]; + extract(temp,pointers,offset); + }); +#endif } else { - autoView(rhs_v , rhs, AcceleratorRead); - Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < &rhs, extract(temp,pointers,offset); } }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + + Coordinate coor; + + int o=n*n1; + int oindex = o+b; + + int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); + + int ocb=1<(temp,pointers,offset); + } + }); +#endif } } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,commVector &buffer, int dimension,int plane,int cbmask) +template void Scatter_plane_simple (Lattice &rhs,cshiftVector &buffer, int dimension,int plane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -182,12 +220,19 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - autoView( rhs_v , rhs, AcceleratorWrite); int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; +#ifdef ACCELERATOR_CSHIFT + autoView( rhs_v , rhs, AcceleratorWrite); accelerator_for2d(n,e1,b,e2,1,{ int o = n*_slice_stride; int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); +#else + autoView( rhs_v , rhs, CpuWrite); + thread_for2d(n,e1,b,e2,{ + int o = n*_slice_stride; + int offset = b+n*_slice_block; + merge(rhs_v[so+o+b],pointers,offset); + }); +#endif } else { // Case of SIMD split AND checker dim cannot currently be hit, except in @@ -280,12 +334,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs } { + auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); - auto table = &Cshift_table[0]; accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + autoView(lhs_v , lhs, CpuWrite); + thread_for(i,ent,{ + lhs_v[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -324,12 +386,20 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice Date: Thu, 12 Nov 2020 22:54:27 +0100 Subject: [PATCH 25/56] Option for bounce through the SHM buffer --- Grid/cshift/Cshift_mpi.h | 223 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 213 insertions(+), 10 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 0f0e80b1..375d004e 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -101,7 +101,8 @@ template void Cshift_comms_simd(Lattice& ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { typedef typename vobj::vector_type vector_type; @@ -121,9 +122,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - commVector send_buf(buffer_size); - commVector recv_buf(buffer_size); - + cshiftVector send_buf(buffer_size); + cshiftVector recv_buf(buffer_size); + int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -138,7 +139,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r } else { - int words = send_buf.size(); + int words = buffer_size; if (cbmask != 0x3) words=words>>1; int bytes = words * sizeof(vobj); @@ -150,12 +151,14 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + grid->Barrier(); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); @@ -195,8 +198,15 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd,commVector(buffer_size) ); - std::vector > recv_buf_extract(Nsimd,commVector(buffer_size) ); + std::vector > send_buf_extract(Nsimd); + std::vector > recv_buf_extract(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + + for(int s=0;s void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], + grid->Barrier(); + + send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; + recv_buf_extract_mpi = &recv_buf_extract[i][0]; + grid->SendToRecvFrom((void *)send_buf_extract_mpi, xmit_to_rank, - (void *)&recv_buf_extract[i][0], + (void *)recv_buf_extract_mpi, recv_from_rank, bytes); + + grid->Barrier(); + + rpointers[i] = &recv_buf_extract[i][0]; + } else { + rpointers[i] = &send_buf_extract[nbr_lane][0]; + } + + } + Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); + } + +} +#else +template void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=rhs.Grid(); + Lattice temp(rhs.Grid()); + + int fd = rhs.Grid()->_fdimensions[dimension]; + int rd = rhs.Grid()->_rdimensions[dimension]; + int pd = rhs.Grid()->_processors[dimension]; + int simd_layout = rhs.Grid()->_simd_layout[dimension]; + int comm_dim = rhs.Grid()->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; + cshiftVector send_buf_v(buffer_size); + cshiftVector recv_buf_v(buffer_size); + vobj *send_buf; + vobj *recv_buf; + { + grid->ShmBufferFreeAll(); + size_t bytes = buffer_size*sizeof(vobj); + send_buf=(vobj *)grid->ShmBufferMalloc(bytes); + recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); + } + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + + // int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); + + grid->Barrier(); + + Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + } + } +} + +template void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + GridBase *grid=rhs.Grid(); + const int Nsimd = grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object scalar_object; + typedef typename vobj::scalar_type scalar_type; + + int fd = grid->_fdimensions[dimension]; + int rd = grid->_rdimensions[dimension]; + int ld = grid->_ldimensions[dimension]; + int pd = grid->_processors[dimension]; + int simd_layout = grid->_simd_layout[dimension]; + int comm_dim = grid->_processors[dimension] >1 ; + + //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // int words = sizeof(vobj)/sizeof(vector_type); + + std::vector > send_buf_extract(Nsimd); + std::vector > recv_buf_extract(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + { + size_t bytes = sizeof(scalar_object)*buffer_size; + grid->ShmBufferFreeAll(); + send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + } + for(int s=0;s pointers(Nsimd); // + ExtractPointerArray rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + int cb = (cbmask==0x2)? Odd : Even; + int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x>(permute_type+1)); + int ic= (i&inner_bit)? 1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer + int nbr_ox = (nbr_coor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + int recv_from_rank; + int xmit_to_rank; + + if (nbr_ic) nbr_lane|=inner_bit; + + assert (sx == nbr_ox); + + if(nbr_proc){ + grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); + grid->SendToRecvFrom((void *)send_buf_extract_mpi, + xmit_to_rank, + (void *)recv_buf_extract_mpi, + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); + grid->Barrier(); rpointers[i] = &recv_buf_extract[i][0]; } else { @@ -258,7 +461,7 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice Date: Fri, 13 Nov 2020 01:38:54 +0100 Subject: [PATCH 26/56] Option for host or device Cshift implementation --- Grid/allocator/AlignedAllocator.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 249732fb..4b357523 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -165,9 +165,17 @@ template inline bool operator!=(const devAllocator<_Tp>&, const d //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -//template using commAllocator = devAllocator; +#ifdef ACCELERATOR_CSHIFT +// Cshift on device +template using cshiftAllocator = devAllocator; +#else +// Cshift on host +template using cshiftAllocator = std::allocator; +#endif + template using Vector = std::vector >; template using commVector = std::vector >; +template using cshiftVector = std::vector >; NAMESPACE_END(Grid); From b13d1f72389a8a7b9cde67f1426bd2ae1b3a2e02 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:49:44 +0100 Subject: [PATCH 27/56] TOFU compat flag to help Isaaku --- Grid/GridStd.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/GridStd.h b/Grid/GridStd.h index ecb561ea..28f6bc46 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -28,4 +28,7 @@ /////////////////// #include "Config.h" +#ifdef TOFU +#undef GRID_COMMS_THREADS +#endif #endif /* GRID_STD_H */ From 6e313575bed12d65f40785d5bc8af05d6a6e5a6f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:50:16 +0100 Subject: [PATCH 28/56] Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac --- Grid/threads/Accelerator.cc | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 2134d158..bd13e04c 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -48,7 +48,7 @@ void acceleratorInit(void) prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { -#ifndef GRID_IBM_SUMMIT +#ifndef GRID_DEFAULT_GPU if ( i==rank ) { printf("AcceleratorCudaInit[%d]: ========================\n",rank); printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i); @@ -73,11 +73,17 @@ void acceleratorInit(void) #undef GPU_PROP_FMT #undef GPU_PROP -#ifdef GRID_IBM_SUMMIT +#ifdef GRID_DEFAULT_GPU // IBM Jsrun makes cuda Device numbering screwy and not match rank - if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n"); + if ( world_rank == 0 ) { + printf("AcceleratorCudaInit: using default device \n"); + printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n"); + printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n"); + printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n"); + } #else printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank); + printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n"); cudaSetDevice(rank); #endif if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); @@ -139,11 +145,18 @@ void acceleratorInit(void) MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours #undef GPU_PROP_FMT #undef GPU_PROP -#ifdef GRID_IBM_SUMMIT - // IBM Jsrun makes cuda Device numbering screwy and not match rank - if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n"); + +#ifdef GRID_DEFAULT_GPU + if ( world_rank == 0 ) { + printf("AcceleratorHipInit: using default device \n"); + printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n"); + printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n"); + } #else - if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n"); + if ( world_rank == 0 ) { + printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank); + printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n"); + } hipSetDevice(rank); #endif if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); From cf23eff60eb387d26d490fbbe8ed8ba0e32776cd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:51:08 +0100 Subject: [PATCH 29/56] Device to Device, Memset, cannot assume UVM == Communicable --- Grid/threads/Accelerator.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index da8a85b0..75d557fd 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -166,15 +166,18 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} inline int acceleratorIsCommunicable(void *ptr) { - int uvm; - auto - cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); - assert(cuerr == cudaSuccess ); - if(uvm) return 0; - else return 1; + // int uvm=0; + // auto + // cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); + // assert(cuerr == cudaSuccess ); + // if(uvm) return 0; + // else return 1; + return 1; } #endif @@ -229,8 +232,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) { #if 0 @@ -332,6 +337,8 @@ inline void acceleratorFreeShared(void *ptr){ free(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} #endif @@ -369,8 +376,10 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline int acceleratorIsCommunicable(void *ptr){ return 1; } +inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} #ifdef HAVE_MM_MALLOC_H inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; @@ -393,6 +402,8 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN, inline void acceleratorFreeCpu (void *ptr){free(ptr);}; #endif + + /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// From d05ce01809444087cceaf22c86c37d8977024c2a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:52:19 +0100 Subject: [PATCH 30/56] TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED --- Grid/communicator/Communicator_mpi3.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 83f71233..c6543851 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { -#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori +#ifndef GRID_COMMS_THREADS nCommThreads=1; // wrong results here too // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs @@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Fri, 13 Nov 2020 03:57:58 +0100 Subject: [PATCH 31/56] Work on 2,2,2,8 ranks --- benchmarks/Benchmark_ITT.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 538366d7..8495bbc5 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,11 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; +#if 1 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; +#else + double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; +#endif double flops=(fps*volume)/2; double mf_hi, mf_lo, mf_err; @@ -498,8 +502,9 @@ public: int threads = GridThread::GetThreads(); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); - GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); From e9bc7488280a824750279bdfafcd2891eeb65c37 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:58:34 +0100 Subject: [PATCH 32/56] Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel --- benchmarks/Benchmark_comms_host_device.cc | 260 ++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 benchmarks/Benchmark_comms_host_device.cc diff --git a/benchmarks/Benchmark_comms_host_device.cc b/benchmarks/Benchmark_comms_host_device.cc new file mode 100644 index 00000000..591b5597 --- /dev/null +++ b/benchmarks/Benchmark_comms_host_device.cc @@ -0,0 +1,260 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_comms.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void header(){ + std::cout <1) nmu++; + + std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< > xbuf(8); + std::vector > rbuf(8); + + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + } + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout< xbuf(8); + std::vector rbuf(8); + + uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + } + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout< Date: Fri, 13 Nov 2020 03:59:36 +0100 Subject: [PATCH 33/56] Must ask for COMMMS_THREADS --- Grid/util/Init.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 37d16176..9be39e94 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -473,11 +473,13 @@ void Grid_init(int *argc,char ***argv) LebesgueOrder::UseLebesgueOrder=1; } CartesianCommunicator::nCommThreads = 1; +#ifdef GRID_COMMS_THREADS if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); assert(CartesianCommunicator::nCommThreads > 0); } +#endif if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); From 18ef8056ecb846245ab8b6a7d3071dcbbea889af Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 04:10:40 +0100 Subject: [PATCH 34/56] Hide Shared Memory --- Grid/communicator/SharedMemoryMPI.cc | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 0cbde9eb..5200b65c 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -457,8 +457,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; exit(EXIT_FAILURE); } - if ( WorldRank == 0 ){ - std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes + // if ( WorldRank == 0 ){ + if ( 1 ){ + std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes << "bytes at "<< std::hex<< ShmCommBuf < ranks(size); for(int r=0;r Date: Fri, 13 Nov 2020 04:11:03 +0100 Subject: [PATCH 35/56] Update options and simplify --- configure.ac | 88 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/configure.ac b/configure.ac index cee2a84c..9a020a7a 100644 --- a/configure.ac +++ b/configure.ac @@ -153,18 +153,28 @@ case ${ac_SFW_FP16} in AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; esac -############### SUMMIT JSRUN -AC_ARG_ENABLE([summit], - [AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])], - [ac_SUMMIT=${enable_summit}], [ac_SUMMIT=no]) -case ${ac_SUMMIT} in - no);; +############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons +AC_ARG_ENABLE([accelerator-cshift], + [AC_HELP_STRING([--enable-accelerator-cshift=yes|no], [run cshift on the device])], + [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes]) + +AC_ARG_ENABLE([ucx-buggy], + [AC_HELP_STRING([--enable-ucx-buggy=yes|no], [enable workaround for UCX device buffer bugs])], + [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no]) + +case ${ac_UCXBUGGY} in yes) - AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);; - *) - AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);; + ac_ACC_CSHIFT=no;; + *);; esac +case ${ac_ACC_CSHIFT} in + yes) + AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);; + *);; +esac + + ############### SYCL/CUDA/HIP/none AC_ARG_ENABLE([accelerator], [AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])], @@ -181,8 +191,9 @@ case ${ac_ACCELERATOR} in echo HIP acceleration AC_DEFINE([GRID_HIP],[1],[Use HIP offload]);; none) - echo NO acceleration - ;; + echo NO acceleration ;; + no) + echo NO acceleration ;; *) AC_MSG_ERROR(["Acceleration not suppoorted ${ac_ACCELERATOR}"]);; esac @@ -477,28 +488,26 @@ esac AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS" AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS" -############### Precision selection - deprecate -#AC_ARG_ENABLE([precision], -# [AC_HELP_STRING([--enable-precision=single|double], -# [Select default word size of Real])], -# [ac_PRECISION=${enable_precision}],[ac_PRECISION=double]) - +###### PRECISION ALWAYS DOUBLE AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) -#case ${ac_PRECISION} in -# single) -# AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] ) -# ;; -# double) -# ;; -# *) -# AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]); -# ;; -#esac +######################################################### +###################### set GPU device to rank in node ## +######################################################### +AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice], + [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no]) +case ${ac_SETDEVICE} in + yes);; + *) + AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] ) + ;; +esac -###################### Shared memory allocation technique under MPI3 -AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone], - [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen]) +######################################################### +###################### Shared memory intranode ######### +######################################################### +AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no], + [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no]) case ${ac_SHM} in @@ -517,8 +526,12 @@ case ${ac_SHM} in AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] ) ;; - shmnone) + shmnone | no) AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] ) + AC_DEFINE([GRID_SHM_DISABLE],[1],[USE MPI for intranode comms]);; + + nvlink) + AC_DEFINE([GRID_MPI3_SHM_NVLINK],[1],[GRID_MPI3_SHM_NVLINK] ) ;; hugetlbfs) @@ -537,10 +550,23 @@ AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path], [ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/]) AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing]) + +############### communication type selection +AC_ARG_ENABLE([comms-threads],[AC_HELP_STRING([--enable-comms-threads | --disable-comms-threads], + [Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes]) + +case ${ac_COMMS_THREADS} in + yes) + AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] ) + ;; + *) ;; +esac + ############### communication type selection AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto], [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) + case ${ac_COMMS} in none) AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) From 3aab983760b188bbaf4c0523d8f1faa39ed84919 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 16 Nov 2020 17:13:58 +0100 Subject: [PATCH 36/56] Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency) --- benchmarks/Benchmark_ITT.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 8ab26fc1..5d602ce9 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,7 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; -#if 1 +#if 0 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; #else double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; @@ -512,7 +512,6 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout< Date: Mon, 16 Nov 2020 18:07:15 -0500 Subject: [PATCH 37/56] Switch off SHM paths with --disable-shm --- Grid/communicator/SharedMemoryMPI.cc | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 5200b65c..4b440fc0 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -772,7 +772,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) std::vector ranks(size); for(int r=0;r Date: Mon, 16 Nov 2020 20:15:50 -0500 Subject: [PATCH 38/56] --shm-force-mpi --- Grid/communicator/SharedMemoryMPI.cc | 7 ++++--- configure.ac | 9 +++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 4b440fc0..6089093b 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -666,7 +666,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) #endif void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); - // std::cout << "Set WorldShmCommBufs["< ranks(size); for(int r=0;r Date: Tue, 17 Nov 2020 04:41:15 -0800 Subject: [PATCH 39/56] Build without LIME --- benchmarks/Benchmark_IO.cc | 5 ++++- benchmarks/Benchmark_IO.hpp | 2 +- benchmarks/Benchmark_IO_vs_dir.cc | 5 ++++- scripts/filelist | 7 +++---- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index 0d80d425..87e7224d 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -1,4 +1,3 @@ - #include "Benchmark_IO.hpp" #ifndef BENCH_IO_LMIN @@ -13,6 +12,7 @@ #define BENCH_IO_NPASS 10 #endif +#ifdef HAVE_LIME using namespace Grid; std::string filestem(const int l) @@ -196,3 +196,6 @@ int main (int argc, char ** argv) return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp index c4a6ca58..2ff42d52 100644 --- a/benchmarks/Benchmark_IO.hpp +++ b/benchmarks/Benchmark_IO.hpp @@ -2,12 +2,12 @@ #define Benchmark_IO_hpp_ #include -#ifdef HAVE_LIME #define MSG std::cout << GridLogMessage #define SEP \ "-----------------------------------------------------------------------------" #define BIGSEP \ "=============================================================================" +#ifdef HAVE_LIME namespace Grid { diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc index e030bc39..8252547b 100644 --- a/benchmarks/Benchmark_IO_vs_dir.cc +++ b/benchmarks/Benchmark_IO_vs_dir.cc @@ -1,5 +1,5 @@ #include "Benchmark_IO.hpp" - +#ifdef HAVE_LIME using namespace Grid; int main (int argc, char ** argv) @@ -97,3 +97,6 @@ int main (int argc, char ** argv) return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/scripts/filelist b/scripts/filelist index 78747315..27425a3e 100755 --- a/scripts/filelist +++ b/scripts/filelist @@ -26,11 +26,10 @@ for subdir in $dirs; do echo "tests-local: ${TESTLIST} " > Make.inc echo ${PREF}_PROGRAMS = ${TESTLIST} >> Make.inc echo >> Make.inc - HADLINK=`[ $subdir = './hadrons' ] && echo '-lHadrons '` for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=${HADLINK}-lGrid >> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done if [ $subdir != '.' ]; then @@ -49,7 +48,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done cd .. @@ -65,7 +64,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc echo >> Make.inc done cd .. From 804a810d68df3dcd285798e4dd325224e1ef2470 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 18 Nov 2020 03:06:53 +0000 Subject: [PATCH 40/56] Wildcard mismatch --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index a387be8a..5b22309d 100644 --- a/configure.ac +++ b/configure.ac @@ -498,7 +498,7 @@ AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdev [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no]) case ${ac_SETDEVICE} in yes);; - *) + no) AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] ) ;; esac From 5adae5d6ff223fb52b6d69d87d0446f6c865a499 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 Nov 2020 19:22:12 +0100 Subject: [PATCH 41/56] Unused variable remove --- Grid/tensors/Tensor_Ta.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index f7af85b7..bbaa4a00 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -92,7 +92,6 @@ accelerator_inline iMatrix ProjectOnGroup(const iMatrix &arg) { // need a check for the group type? iMatrix ret(arg); - vtype rnrm; vtype nrm; vtype inner; for(int c1=0;c1 Date: Thu, 19 Nov 2020 19:23:03 +0100 Subject: [PATCH 42/56] Warning remove --- Grid/allocator/MemoryManagerShared.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 537f7c32..3f165007 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -1,7 +1,6 @@ #include #ifdef GRID_UVM -#warning "Grid is assuming unified virtual memory address space" NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////// // View management is 1:1 address space mapping From d5049949a45cee89cb6ff2de004fe719c36db8b6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 Nov 2020 19:23:41 +0100 Subject: [PATCH 43/56] Starting to fix reunitarise --- Grid/qcd/utils/SUn.h | 84 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 7ac53246..69ab4ebb 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -735,7 +735,6 @@ public: } } - template static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) { typedef typename GaugeField::vector_type vector_type; @@ -800,6 +799,89 @@ public: } }; +template +LatticeComplexD Determinant(const Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + auto lvol = grid->lSites(); + LatticeComplexD ret(grid); + + autoView(Umu_v,Umu,CpuRead); + autoView(ret_v,ret,CpuWrite); + thread_for(site,lvol,{ + Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(site, lcoor); + iScalar > > Us; + peekLocalSite(Us, Umu_v, lcoor); + for(int i=0;i +static void ProjectSUn(Lattice > > > &Umu) +{ + Umu = ProjectOnGroup(Umu); + auto det = Determinant(Umu); + + det = pow(det,-1); + + for(int i=0;i(Umu,N-1,i); + element = element * det; + PokeIndex(Umu,element,Nc-1,i); + } +} +template +static void ProjectSUn(Lattice >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSUn(Umu); + PokeIndex(U,Umu,mu); + } +} +// Explicit specialisation for SU(3). +// Explicit specialisation for SU(3). +static void +ProjectSU3 (Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + const int x=0; + const int y=1; + const int z=2; + // Reunitarise + Umu = ProjectOnGroup(Umu); + autoView(Umu_v,Umu,CpuWrite); + thread_for(ss,grid->oSites(),{ + auto cm = Umu_v[ss]; + cm()()(2,x) = adj(cm()()(0,y)*cm()()(1,z)-cm()()(0,z)*cm()()(1,y)); //x= yz-zy + cm()()(2,y) = adj(cm()()(0,z)*cm()()(1,x)-cm()()(0,x)*cm()()(1,z)); //y= zx-xz + cm()()(2,z) = adj(cm()()(0,x)*cm()()(1,y)-cm()()(0,y)*cm()()(1,x)); //z= xy-yx + Umu_v[ss]=cm; + }); +} +static void ProjectSU3(Lattice >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSU3(Umu); + PokeIndex(U,Umu,mu); + } +} + typedef SU<2> SU2; typedef SU<3> SU3; typedef SU<4> SU4; From aace3d47b993a6269d6b53ae3705aa9cf299bd16 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 Nov 2020 19:24:14 +0100 Subject: [PATCH 44/56] partial work in progress --- tests/core/Test_reunitarise.cc | 137 +++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 tests/core/Test_reunitarise.cc diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc new file mode 100644 index 00000000..3e78b961 --- /dev/null +++ b/tests/core/Test_reunitarise.cc @@ -0,0 +1,137 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD UU(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU::HotConfiguration(pRNG,Umu); + + U = PeekIndex(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "<(U,Nc-1,i); + element = element * phase; + PokeIndex(U,element,Nc-1,i); + } + UU=U; + + detU= Determinant(U) ; + std::cout << "Determinant after screw up " <(UU); + detUU= Determinant(UU); + std::cout << "Determinant ProjectSUn " < Date: Fri, 20 Nov 2020 16:48:28 +0100 Subject: [PATCH 45/56] Configurable ALLOC_ALIGN and ALLOC_CACHE --- Grid/allocator/MemoryManager.h | 2 -- configure.ac | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index aac13aee..25c5b5f5 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? -#define ALLOCATION_CACHE -#define GRID_ALLOC_ALIGN (2*1024*1024) #define GRID_ALLOC_SMALL_LIMIT (4096) /*Pinning pages is costly*/ diff --git a/configure.ac b/configure.ac index 5b22309d..4d16d776 100644 --- a/configure.ac +++ b/configure.ac @@ -491,6 +491,28 @@ AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS" ###### PRECISION ALWAYS DOUBLE AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) +######################################################### +###################### GRID ALLOCATOR ALIGNMENT ## +######################################################### +AC_ARG_ENABLE([alloc-align],[AC_HELP_STRING([--enable-alloc-align=2MB|4k], + [Alignment in bytes of GRID Allocator ])],[ac_ALLOC_ALIGN=${enable_alloc_align}],[ac_ALLOC_ALIGN=2MB]) +case ${ac_ALLOC_ALIGN} in + 4k) + AC_DEFINE([GRID_ALLOC_ALIGN],[(4096)],[GRID_ALLOC_ALIGN]);; + 2MB) + AC_DEFINE([GRID_ALLOC_ALIGN],[(2*1024*1024)],[GRID_ALLOC_ALIGN]);; + *);; +esac + +AC_ARG_ENABLE([alloc-cache],[AC_HELP_STRING([--enable-alloc-cache ], + [Cache a pool of recent "frees" to reuse])],[ac_ALLOC_CACHE=${enable_alloc_cache}],[ac_ALLOC_CACHE=yes]) +case ${ac_ALLOC_CACHE} in + yes) + AC_DEFINE([ALLOCATION_CACHE],[1],[ALLOCATION_CACHE]);; + *);; +esac + + ######################################################### ###################### set GPU device to rank in node ## ######################################################### From 86e8b9fe387a922e13d4cfe67a2dbd5554f6ed46 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 20 Nov 2020 17:07:16 +0100 Subject: [PATCH 46/56] ALLOC_ALIGN removed --- Grid/threads/Accelerator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 5f5cd5fe..6232aea8 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -361,7 +361,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas ////////////////////////////////////////////// // CPU Target - No accelerator just thread instead ////////////////////////////////////////////// -#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned + #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) #undef GRID_SIMT From 147dc15d26da963d5c033701ad9e9211dcba50d3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 20 Nov 2020 13:13:59 -0500 Subject: [PATCH 47/56] Update --- benchmarks/Benchmark_ITT.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 5d602ce9..032535b3 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,7 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; -#if 0 +#if 1 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; #else double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; From d4861a362ccaca6bbf300da450f46c10ea40ed29 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:38:49 +0000 Subject: [PATCH 48/56] Stencil use non-UVM memory for look up table on enable-shared=no --- Grid/stencil/Stencil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1e198972..23fc8203 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -269,7 +269,7 @@ public: std::vector > > face_table ; Vector surface_list; - Vector _entries; // Resident in managed memory + stencilVector _entries; // Resident in managed memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; From 683a5e5bf556f97824fd242f76d0a81ab8dd7bd8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:39:51 +0000 Subject: [PATCH 49/56] Stencil use host vector for integera table on enable-shared=no and mirror it on device --- Grid/allocator/AlignedAllocator.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 4b357523..91622789 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -173,7 +173,8 @@ template using cshiftAllocator = devAllocator; template using cshiftAllocator = std::allocator; #endif -template using Vector = std::vector >; +template using Vector = std::vector >; +template using stencilVector = std::vector >; template using commVector = std::vector >; template using cshiftVector = std::vector >; From 97e264d0ff6562008be60ee4f3ba355198c7f247 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:46:11 +0000 Subject: [PATCH 50/56] Christoph's changes --- Grid/allocator/MemoryManagerCache.cc | 36 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 5dd7575e..50076eba 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -1,11 +1,12 @@ #include - #ifndef GRID_UVM #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); +//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout); #define dprintf(...) + //////////////////////////////////////////////////////////// // For caching copies of data on device //////////////////////////////////////////////////////////// @@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); if(AccCache.state==AccDirty) { @@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - // dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) + EvictVictims(bytes); + assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { + dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + (uint64_t)AccCache.CpuPtr, + (uint64_t)CpuPtr, + (uint64_t)AccCache.bytes, + (uint64_t)bytes); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - // printf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - // printf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); } else { assert(0); } @@ -361,13 +368,14 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) + EvictVictims(bytes); + assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error From 321f0f51b59109f9cb2b17d0e0f6a1883076be54 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 24 Nov 2020 21:46:10 -0500 Subject: [PATCH 51/56] Project to SU(N) --- Grid/qcd/action/gauge/GaugeImplTypes.h | 4 ++++ Grid/qcd/action/scalar/ScalarImpl.h | 8 ++++++++ Grid/qcd/hmc/HMC.h | 2 +- Grid/qcd/hmc/integrators/Integrator.h | 2 ++ Grid/qcd/utils/SUn.h | 5 ++--- tests/core/Test_reunitarise.cc | 19 +++++++++++++------ 6 files changed, 30 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index 9b7d5a60..55a20eca 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -154,6 +154,10 @@ public: return Hsum.real(); } + static inline void Project(Field &U) { + ProjectSUn(U); + } + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { SU::HotConfiguration(pRNG, U); } diff --git a/Grid/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h index 14675b11..403ea573 100644 --- a/Grid/qcd/action/scalar/ScalarImpl.h +++ b/Grid/qcd/action/scalar/ScalarImpl.h @@ -54,6 +54,10 @@ public: static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { U = 1.0; } + + static inline void Project(Field &U) { + return; + } static void MomentumSpacePropagator(Field &out, RealD m) { @@ -234,6 +238,10 @@ public: #endif //USE_FFT_ACCELERATION } + static inline void Project(Field &U) { + return; + } + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U); } diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h index 0f933204..f168b69a 100644 --- a/Grid/qcd/hmc/HMC.h +++ b/Grid/qcd/hmc/HMC.h @@ -95,7 +95,7 @@ private: typedef typename IntegratorType::Field Field; typedef std::vector< HmcObservable * > ObsListType; - + //pass these from the resource manager GridSerialRNG &sRNG; GridParallelRNG &pRNG; diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index d5475704..70055754 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -313,6 +313,8 @@ public: std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } + FieldImplementation::Project(U); + // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 69ab4ebb..675493b3 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -820,7 +820,6 @@ LatticeComplexD Determinant(const Lattice }} ComplexD det = EigenU.determinant(); pokeLocalSite(det,ret_v,lcoor); - std::cout << " site " < > > > &Umu) Umu = ProjectOnGroup(Umu); auto det = Determinant(Umu); - det = pow(det,-1); - + det = conjugate(det); + for(int i=0;i(Umu,N-1,i); element = element * det; diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc index 3e78b961..9a6781f1 100644 --- a/tests/core/Test_reunitarise.cc +++ b/tests/core/Test_reunitarise.cc @@ -102,7 +102,8 @@ int main (int argc, char ** argv) LatticeComplexD detUU(grid); detU= Determinant(U) ; - std::cout << "Determinant before screw up " <(UU); + ProjectSUn(UU); detUU= Determinant(UU); - std::cout << "Determinant ProjectSUn " < Date: Wed, 2 Dec 2020 17:55:30 -0800 Subject: [PATCH 52/56] Duplicate code --- Grid/allocator/AlignedAllocator.cc | 67 ------------------------------ 1 file changed, 67 deletions(-) delete mode 100644 Grid/allocator/AlignedAllocator.cc diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc deleted file mode 100644 index 0d1707d9..00000000 --- a/Grid/allocator/AlignedAllocator.cc +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include - -NAMESPACE_BEGIN(Grid); - -MemoryStats *MemoryProfiler::stats = nullptr; -bool MemoryProfiler::debug = false; - -void check_huge_pages(void *Buf,uint64_t BYTES) -{ -#ifdef __linux__ - int fd = open("/proc/self/pagemap", O_RDONLY); - assert(fd >= 0); - const int page_size = 4096; - uint64_t virt_pfn = (uint64_t)Buf / page_size; - off_t offset = sizeof(uint64_t) * virt_pfn; - uint64_t npages = (BYTES + page_size-1) / page_size; - uint64_t pagedata[npages]; - uint64_t ret = lseek(fd, offset, SEEK_SET); - assert(ret == offset); - ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); - assert(ret == sizeof(uint64_t) * npages); - int nhugepages = npages / 512; - int n4ktotal, nnothuge; - n4ktotal = 0; - nnothuge = 0; - for (int i = 0; i < nhugepages; ++i) { - uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; - for (int j = 0; j < 512; ++j) { - uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; - ++n4ktotal; - if (pageaddr != baseaddr + j * page_size) - ++nnothuge; - } - } - int rank = CartesianCommunicator::RankWorld(); - printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); -#endif -} - -std::string sizeString(const size_t bytes) -{ - constexpr unsigned int bufSize = 256; - const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; - char buf[256]; - size_t s = 0; - double count = bytes; - - while (count >= 1024 && s < 7) - { - s++; - count /= 1024; - } - if (count - floor(count) == 0.0) - { - snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); - } - else - { - snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); - } - - return std::string(buf); -} - -NAMESPACE_END(Grid); - From cf76741ec651c41f1d1fa38d22e0474899691e72 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Dec 2020 03:47:11 -0800 Subject: [PATCH 53/56] Intel DPCPP Gold happy now (compiles all, runs Benchmark_dwf_fp32 ) --- Grid/DisableWarnings.h | 2 ++ Grid/Makefile.am | 12 ++++++++ Grid/communicator/SharedMemory.h | 2 +- Grid/communicator/SharedMemoryMPI.cc | 2 +- Grid/communicator/SharedMemoryNone.cc | 43 ++++++++++++++++++++++++++- Grid/lattice/Lattice_basis.h | 5 ++-- benchmarks/Benchmark_gparity.cc | 4 +-- configure.ac | 18 +++++++++++ scripts/filelist | 18 +++++++++-- 9 files changed, 97 insertions(+), 9 deletions(-) diff --git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h index 8ea219fb..4bd1edd0 100644 --- a/Grid/DisableWarnings.h +++ b/Grid/DisableWarnings.h @@ -37,7 +37,9 @@ directory #endif //disables and intel compiler specific warning (in json.hpp) +#ifdef __ICC #pragma warning disable 488 +#endif #ifdef __NVCC__ //disables nvcc specific warning in json.hpp diff --git a/Grid/Makefile.am b/Grid/Makefile.am index f1fa462e..ded6d146 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -21,6 +21,7 @@ if BUILD_HDF5 extra_headers+=serialisation/Hdf5Type.h endif + all: version-cache Version.h version-cache: @@ -53,6 +54,17 @@ Version.h: version-cache include Make.inc include Eigen.inc +extra_sources+=$(ZWILS_FERMION_FILES) +extra_sources+=$(WILS_FERMION_FILES) +extra_sources+=$(STAG_FERMION_FILES) +if BUILD_GPARITY + extra_sources+=$(GP_FERMION_FILES) +endif +if BUILD_FERMION_REPS + extra_sources+=$(ADJ_FERMION_FILES) + extra_sources+=$(TWOIND_FERMION_FILES) +endif + lib_LIBRARIES = libGrid.a CCFILES += $(extra_sources) diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h index 6c6e3953..f2d20a24 100644 --- a/Grid/communicator/SharedMemory.h +++ b/Grid/communicator/SharedMemory.h @@ -102,7 +102,7 @@ public: /////////////////////////////////////////////////// static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryFree(void); - static void SharedMemoryCopy(void *dest,const void *src,size_t bytes); + static void SharedMemoryCopy(void *dest,void *src,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes); }; diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 6089093b..a12418e6 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -715,7 +715,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) bzero(dest,bytes); #endif } -void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes) +void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) { #ifdef GRID_CUDA cudaMemcpy(dest,src,bytes,cudaMemcpyDefault); diff --git a/Grid/communicator/SharedMemoryNone.cc b/Grid/communicator/SharedMemoryNone.cc index ed37ab47..35663632 100644 --- a/Grid/communicator/SharedMemoryNone.cc +++ b/Grid/communicator/SharedMemoryNone.cc @@ -29,6 +29,7 @@ Author: Peter Boyle #include NAMESPACE_BEGIN(Grid); +#define header "SharedMemoryNone: " /*Construct from an MPI communicator*/ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) @@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended, use anonymous mmap //////////////////////////////////////////////////////////////////////////////////////////// +#if 1 +void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) +{ + std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "< Bt(Nm * max_threads); thread_region @@ -164,7 +164,8 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in auto basis_vp=& basis_v[0]; autoView(result_v,result,AcceleratorWrite); accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ - auto B=coalescedRead(zz); + vobj zzz=Zero(); + auto B=coalescedRead(zzz); for(int k=k0; k Make.inc echo >> Make.inc echo CCFILES=$CCFILES >> Make.inc - +echo ZWILS_FERMION_FILES=$ZWILS_FERMION_FILES >> Make.inc +echo WILS_FERMION_FILES=$WILS_FERMION_FILES >> Make.inc +echo STAG_FERMION_FILES=$STAG_FERMION_FILES >> Make.inc +echo GP_FERMION_FILES=$GP_FERMION_FILES >> Make.inc +echo ADJ_FERMION_FILES=$ADJ_FERMION_FILES >> Make.inc +echo TWOIND_FERMION_FILES=$TWOIND_FERMION_FILES >> Make.inc # tests Make.inc cd $home/tests From 2ef1fa66a8afe2066b8c1ef191a608bd64bdb3bd Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Mon, 7 Dec 2020 11:53:35 -0500 Subject: [PATCH 54/56] Improved performance of G-parity kernel for GPUs by simplifying multLink implementation --- Grid/qcd/action/fermion/GparityWilsonImpl.h | 42 ++++++++------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index 0b726db9..9dca403b 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -97,42 +97,30 @@ public: Coordinate icoor; #ifdef GRID_SIMT - _Spinor tmp; - const int Nsimd =SiteDoubledGaugeField::Nsimd(); int s = acceleratorSIMTlane(Nsimd); St.iCoorFromIindex(icoor,s); int mmu = mu % Nd; - if ( SE->_around_the_world && St.parameters.twists[mmu] ) { - - int permute_lane = (sl==1) - || ((distance== 1)&&(icoor[direction]==1)) - || ((distance==-1)&&(icoor[direction]==0)); - if ( permute_lane ) { - tmp(0) = chi(1); - tmp(1) = chi(0); - } else { - tmp(0) = chi(0); - tmp(1) = chi(1); - } + auto UU0=coalescedRead(U(0)(mu)); + auto UU1=coalescedRead(U(1)(mu)); + + //Decide whether we do a G-parity flavor twist + //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir + //It also assumes (but does not check) that abs(distance) == 1 + int permute_lane = (sl==1) + || ((distance== 1)&&(icoor[direction]==1)) + || ((distance==-1)&&(icoor[direction]==0)); - auto UU0=coalescedRead(U(0)(mu)); - auto UU1=coalescedRead(U(1)(mu)); + permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world - mult(&phi(0),&UU0,&tmp(0)); - mult(&phi(1),&UU1,&tmp(1)); + //Apply the links + int f_upper = permute_lane ? 1 : 0; + int f_lower = !f_upper; - } else { - - auto UU0=coalescedRead(U(0)(mu)); - auto UU1=coalescedRead(U(1)(mu)); - - mult(&phi(0),&UU0,&chi(0)); - mult(&phi(1),&UU1,&chi(1)); - - } + mult(&phi(0),&UU0,&chi(f_upper)); + mult(&phi(1),&UU1,&chi(f_lower)); #else typedef _Spinor vobj; From 9aec4a3c2620dd459bac5b89fc0d661aaf50e6cd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Dec 2020 02:11:17 -0800 Subject: [PATCH 55/56] SYCL --- Grid/simd/Grid_gpu_vec.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 8b17f75a..8e55ce2f 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -38,12 +38,20 @@ Author: Peter Boyle #ifdef GRID_HIP #include #endif +#ifdef GRID_SYCL +namespace Grid { + typedef struct { uint16_t x;} half; + typedef struct { half x; half y;} half2; + typedef struct { float x; float y;} float2; + typedef struct { double x; double y;} double2; +} +#endif + namespace Grid { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) -typedef struct { uint16_t x;} half; -#endif + + typedef struct Half2_t { half x; half y; } Half2; #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) @@ -156,7 +164,7 @@ accelerator_inline float half2float(half h) f = __half2float(h); #else Grid_half hh; - hh.x = hr.x; + hh.x = h.x; f= sfw_half_to_float(hh); #endif return f; From 873519e96046acfd0844a7d07d540d989a7a6204 Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Mon, 14 Dec 2020 16:06:10 +0000 Subject: [PATCH 56/56] Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration --- .../implementation/CayleyFermion5DImplementation.h | 4 ++-- tests/debug/Test_cayley_mres.cc | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index b3fbe096..f11e9c44 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -642,7 +642,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, Current curr_type, unsigned int mu) { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -826,7 +826,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 2e56fa81..5282c756 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -108,8 +108,18 @@ int main (int argc, char ** argv) GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU::ColdConfiguration(Umu); - // SU::HotConfiguration(RNG4,Umu); + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); + } RealD mass=0.3; RealD M5 =1.0;