From f0dc0f36214c4dda1eace9d83956e5d7fef4f729 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 22 Aug 2020 13:57:33 +0200 Subject: [PATCH 001/201] fix compile issue on Qpace3 --- Grid/lattice/Lattice_transfer.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index e698e40e..91de721f 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -127,6 +127,11 @@ accelerator_inline void convertType(T1 & out, const iScalar & in) { convertType(out,in._internal); } +template::value, T1>::type* = nullptr> +accelerator_inline void convertType(T1 & out, const iScalar & in) { + convertType(out,in._internal); +} + template accelerator_inline void convertType(iScalar & out, const T2 & in) { convertType(out._internal,in); From 1292d595634172e28158f99d31863d63267ca5ac Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Thu, 11 Jun 2020 13:16:00 +0200 Subject: [PATCH 002/201] Add a typedef + broaden interface of CMat --- Grid/algorithms/CoarsenedMatrix.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 8d184aea..76950baf 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -268,6 +268,21 @@ public: typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; + typedef CoarseVector FermionField; + + // enrich interface + void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeInv(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeInvDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void 
Dminus(CoarseVector const& in, CoarseVector& out) { out = in; } + void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; } + void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; + void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; //////////////////// // Data members From dd1ba266b269b093a8bcd9bf815438d9896d7148 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Fri, 17 Jul 2020 11:58:02 +0200 Subject: [PATCH 003/201] Fix mapping between dir + disp and point in CMat --- Grid/algorithms/CoarsenedMatrix.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 76950baf..d18fba43 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -432,25 +432,25 @@ public: ////////////// // 4D action like wilson - // 0+ => 0 - // 0- => 1 - // 1+ => 2 - // 1- => 3 + // 0+ => 0 + // 0- => 4 + // 1+ => 1 + // 1- => 5 // etc.. ////////////// // 5D action like DWF - // 1+ => 0 - // 1- => 1 - // 2+ => 2 - // 2- => 3 + // 1+ => 0 + // 1- => 4 + // 2+ => 1 + // 2- => 5 // etc.. 
auto point = [dir, disp, ndim](){ if(dir == 0 and disp == 0) return 8; else if ( ndim==4 ) { - return (4 * dir + 1 - disp) / 2; + return (1 - disp) / 2 * 4 + dir; } else { - return (4 * (dir-1) + 1 - disp) / 2; + return (1 - disp) / 2 * 4 + dir - 1; } }(); From b2087f14c48e881ab041bbaef7f6c444c88a895d Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 24 Aug 2020 16:54:36 +0200 Subject: [PATCH 004/201] Fix CoarsenedMatrix regarding illegal memory accesses Need a reference to geom since the lambda copies the this pointer which points to host memory, see - https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture - https://devblogs.nvidia.com/new-compiler-features-cuda-8/ --- Grid/algorithms/CoarsenedMatrix.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index d18fba43..ba40535c 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -310,6 +310,8 @@ public: Stencil.HaloExchange(in,compressor); autoView( in_v , in, AcceleratorRead); autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -331,14 +333,14 @@ public: int ptype; StencilEntry *SE; - for(int point=0;point_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -382,6 +384,7 @@ public: autoView( out_v , out, AcceleratorWrite); autoView( in_v , in, AcceleratorRead); + autoView( Stencil_v , Stencil, AcceleratorRead); const int Nsimd = CComplex::Nsimd(); typedef decltype(coalescedRead(in_v[0])) calcVector; @@ -395,12 +398,12 @@ public: int ptype; StencilEntry *SE; - SE=Stencil.GetEntry(ptype,point,ss); + SE=Stencil_v.GetEntry(ptype,point,ss); if(SE->_is_local) 
{ nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); From 2a75516330925bd05681f4dba639482ca84270d7 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 26 Aug 2020 12:34:17 -0400 Subject: [PATCH 005/201] state MPI/SLURM message only on world_rank zero --- Grid/threads/Accelerator.cc | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 864d90a9..f6df4a31 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -21,22 +21,26 @@ void acceleratorInit(void) #define ENV_RANK_SLURM "SLURM_PROCID" #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" - // We extract the local rank initialization using an environment variable - if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { - printf("OPENMPI detected\n"); - rank = atoi(localRankStr); - } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { - printf("MVAPICH detected\n"); - rank = atoi(localRankStr); - } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { - printf("SLURM detected\n"); - rank = atoi(localRankStr); - } else { - printf("MPI version is unknown - bad things may happen\n"); - } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);} + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { + if (!world_rank) + printf("OPENMPI detected\n"); + rank = atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { 
+ if (!world_rank) + printf("MVAPICH detected\n"); + rank = atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { + if (!world_rank) + printf("SLURM detected\n"); + rank = atoi(localRankStr); + } else { + if (!world_rank) + printf("MPI version is unknown - bad things may happen\n"); + } size_t totalDeviceMem=0; for (int i = 0; i < nDevices; i++) { From cf3535d16e412adf80ca1a2f7ead54003860c94b Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Thu, 27 Aug 2020 14:06:48 +0200 Subject: [PATCH 006/201] Expose more functions in CMat --- Grid/algorithms/CoarsenedMatrix.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index ba40535c..cdfc2ac3 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -271,6 +271,9 @@ public: typedef CoarseVector FermionField; // enrich interface + void Dhop(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } + void DhopEO(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } + void DhopOE(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } From 4d2dc7ba0304397f536a2bc71260266e1475ae83 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 7 Sep 2020 17:57:07 +0200 Subject: [PATCH 007/201] Enable even-odd for CoarsenedMatrix --- Grid/algorithms/CoarsenedMatrix.h | 471 +++++++++++++++++++++++--- tests/solver/Test_coarse_even_odd.cc | 475 +++++++++++++++++++++++++++ 2 files changed, 899 insertions(+), 47 deletions(-) create mode 100644 tests/solver/Test_coarse_even_odd.cc diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index cdfc2ac3..66b9c169 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ 
b/Grid/algorithms/CoarsenedMatrix.h @@ -31,6 +31,7 @@ Author: paboyle #ifndef GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H +#include // needed for Dagger(Yes|No), Inverse(Yes|No) NAMESPACE_BEGIN(Grid); @@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice &CoarseInner, class Geometry { public: int npoint; + int base; std::vector directions ; std::vector displacements; + std::vector points_dagger; Geometry(int _d) { - int base = (_d==5) ? 1:0; + base = (_d==5) ? 1:0; // make coarse grid stencil for 4d , not 5d if ( _d==5 ) _d=4; @@ -72,16 +75,51 @@ public: npoint = 2*_d+1; directions.resize(npoint); displacements.resize(npoint); + points_dagger.resize(npoint); for(int d=0;d<_d;d++){ directions[d ] = d+base; directions[d+_d] = d+base; displacements[d ] = +1; displacements[d+_d]= -1; + points_dagger[d ] = d+_d; + points_dagger[d+_d] = d; } directions [2*_d]=0; displacements[2*_d]=0; + points_dagger[2*_d]=2*_d; } + int point(int dir, int disp) { + assert(disp == -1 || disp == 0 || disp == 1); + assert(base+0 <= dir && dir < base+4); + + // directions faster index = new indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 1 2 3 0 1 2 3 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 2 3 4 1 2 3 4 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + + // displacements faster index = old indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 0 1 1 2 2 3 3 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 1 2 2 3 3 4 4 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + + if(dir == 0 and disp == 0) + return 8; + else // New indexing + return (1 - disp) / 2 * 4 + dir - base; + // else // Old indexing + // return (4 * (dir - base) + 1 - disp) / 2; + } }; template @@ -258,7 +296,7 @@ public: // Fine Object == (per site) type of fine field // nbasis == number of deflation vectors template -class CoarsenedMatrix : public 
SparseMatrixBase > > { +class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase > > { public: typedef iVector siteVector; @@ -270,16 +308,7 @@ public: typedef Lattice FineField; typedef CoarseVector FermionField; - // enrich interface - void Dhop(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void DhopEO(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void DhopOE(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } - void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeInv(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeInvDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + // enrich interface, use default implementation as in FermionOperator /////// void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; } void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; } void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; } @@ -292,21 +321,36 @@ public: //////////////////// Geometry geom; GridBase * _grid; + GridBase* _cbgrid; int hermitian; CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; std::vector A; - + std::vector Aeven; + std::vector Aodd; + + CoarseMatrix AselfInv; + CoarseMatrix AselfInvEven; + CoarseMatrix AselfInvOdd; + + Vector dag_factor; + /////////////////////// // Interface /////////////////////// GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know + GridBase * RedBlackGrid() { return _cbgrid; }; + + int ConstEE() { return 0; } void M (const CoarseVector &in, CoarseVector &out) { conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); + out.Checkerboard() = 
in.Checkerboard(); SimpleCompressor compressor; @@ -364,12 +408,72 @@ public: return M(in,out); } else { // corresponds to Galerkin coarsening - CoarseVector tmp(Grid()); - G5C(tmp, in); - M(tmp, out); - G5C(out, out); + return MdagNonHermitian(in, out); } }; + + void MdagNonHermitian(const CoarseVector &in, CoarseVector &out) + { + conformable(_grid,in.Grid()); + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + + for(int p=0;poSites(); + + Vector points(geom.npoint, 0); + for(int p=0; poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; @@ -379,6 +483,7 @@ public: { conformable(_grid,in.Grid()); conformable(_grid,out.Grid()); + out.Checkerboard() = in.Checkerboard(); typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -434,34 +539,7 @@ public: this->MdirComms(in); - int ndim = in.Grid()->Nd(); - - ////////////// - // 4D action like wilson - // 0+ => 0 - // 0- => 4 - // 1+ => 1 - // 1- => 5 - // etc.. - ////////////// - // 5D action like DWF - // 1+ => 0 - // 1- => 4 - // 2+ => 1 - // 2- => 5 - // etc.. 
- auto point = [dir, disp, ndim](){ - if(dir == 0 and disp == 0) - return 8; - else if ( ndim==4 ) { - return (1 - disp) / 2 * 4 + dir; - } else { - return (1 - disp) / 2 * 4 + dir - 1; - } - }(); - - MdirCalc(in,out,point); - + MdirCalc(in,out,geom.point(dir,disp)); }; void Mdiag(const CoarseVector &in, CoarseVector &out) @@ -470,17 +548,269 @@ public: MdirCalc(in, out, point); // No comms }; + void Mooee(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseNo); + } + + void MooeeInv(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseYes); + } + + void MooeeDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseNo); + } + + void MooeeInvDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseYes); + } + + void Meooe(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } + } + + void MeooeDag(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } + } + + void Dhop(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, A, in, out, dag); + } + + void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, Aodd, in, out, dag); + } + + void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + 
out.Checkerboard() = Even; + + DhopInternal(StencilOdd, Aeven, in, out, dag); + } + + void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) { + out.Checkerboard() = in.Checkerboard(); + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + + CoarseMatrix *Aself = nullptr; + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1]; + DselfInternal(StencilOdd, *Aself, in, out, dag); + } else { + Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1]; + DselfInternal(StencilEven, *Aself, in, out, dag); + } + } else { + Aself = (inv) ? &AselfInv : &A[geom.npoint-1]; + DselfInternal(Stencil, *Aself, in, out, dag); + } + assert(Aself != nullptr); + } + + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, + const CoarseVector &in, CoarseVector &out, int dag) { + int point = geom.npoint-1; + autoView( out_v, out, AcceleratorWrite); + autoView( in_v, in, AcceleratorRead); + autoView( st_v, st, AcceleratorRead); + autoView( a_v, a, AcceleratorRead); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + RealD* dag_factor_p = &dag_factor[0]; + + if(dag) { + accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = 
coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb &st, std::vector &a, + const CoarseVector &in, CoarseVector &out, int dag) { + SimpleCompressor compressor; + + st.HaloExchange(in,compressor); + autoView( in_v, in, AcceleratorRead); + autoView( out_v, out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + typedef LatticeView Aview; + + // determine in what order we need the points + int npoint = geom.npoint-1; + Vector points(npoint, 0); + for(int p=0; p AcceleratorViewContainer; + for(int p=0;poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb > &linop, Aggregation & Subspace) { @@ -629,6 +959,9 @@ public: std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); + + typedef typename Cobj::scalar_object scalar_object; + + autoView(Aself_v, A[geom.npoint-1], CpuRead); + autoView(AselfInv_v, AselfInv, CpuWrite); + thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke + Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + + scalar_object selfLink = Zero(); + scalar_object selfLinkInv = Zero(); + + Coordinate lcoor; + + Grid()->LocalIndexToLocalCoor(site, lcoor); + peekLocalSite(selfLink, 
Aself_v, lcoor); + + for (int i = 0; i < nbasis; ++i) + for (int j = 0; j < nbasis; ++j) + selfLinkEigen(i, j) = static_cast(TensorRemove(selfLink(i, j))); + + selfLinkInvEigen = selfLinkEigen.inverse(); + + for(int i = 0; i < nbasis; ++i) + for(int j = 0; j < nbasis; ++j) + selfLinkInv(i, j) = selfLinkInvEigen(i, j); + + pokeLocalSite(selfLinkInv, AselfInv_v, lcoor); + }); + } + + void FillHalfCbs() { + std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl; + for(int p = 0; p < geom.npoint; ++p) { + pickCheckerboard(Even, Aeven[p], A[p]); + pickCheckerboard(Odd, Aodd[p], A[p]); + } + pickCheckerboard(Even, AselfInvEven, AselfInv); + pickCheckerboard(Odd, AselfInvOdd, AselfInv); + } }; NAMESPACE_END(Grid); diff --git a/tests/solver/Test_coarse_even_odd.cc b/tests/solver/Test_coarse_even_odd.cc new file mode 100644 index 00000000..dfbab747 --- /dev/null +++ b/tests/solver/Test_coarse_even_odd.cc @@ -0,0 +1,475 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/solver/Test_coarse_even_odd.cc + + Copyright (C) 2015-2020 + + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace Grid; + +#ifndef NBASIS +#define NBASIS 40 +#endif + +// NOTE: The tests in this file are written in analogy to +// - tests/core/Test_wilson_even_odd.cc +// - tests/core/Test_wilson_clover.cc + +std::vector readFromCommandlineIvec(int* argc, + char*** argv, + std::string&& option, + const std::vector& defaultValue) { + std::string arg; + std::vector ret(defaultValue); + if(GridCmdOptionExists(*argv, *argv + *argc, option)) { + arg = GridCmdOptionPayload(*argv, *argv + *argc, option); + GridCmdOptionIntVector(arg, ret); + } + return ret; +} + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + ///////////////////////////////////////////////////////////////////////////// + // Read from command line // + ///////////////////////////////////////////////////////////////////////////// + + const int nbasis = NBASIS; static_assert((nbasis & 0x1) == 0, ""); + const int nb = nbasis/2; + Coordinate blockSize = readFromCommandlineIvec(&argc, &argv, "--blocksize", {2, 2, 2, 2}); + + std::cout << GridLogMessage << "Compiled with nbasis = " << nbasis << " -> nb = " << nb << std::endl; + + ///////////////////////////////////////////////////////////////////////////// + // General setup // + ///////////////////////////////////////////////////////////////////////////// + + Coordinate clatt = GridDefaultLatt(); + for(int d=0; dshow_decomposition(); + std::cout << GridLogMessage << "Grid_c:" << std::endl; Grid_c->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_f:" << std::endl; RBGrid_f->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_c:" << std::endl; RBGrid_c->show_decomposition(); + + GridParallelRNG pRNG_f(Grid_f); + GridParallelRNG pRNG_c(Grid_c); + + std::vector seeds({1, 2, 3, 4}); + + 
pRNG_f.SeedFixedIntegers(seeds); + pRNG_c.SeedFixedIntegers(seeds); + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Dirac Matrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + LatticeGaugeField Umu(Grid_f); SU3::HotConfiguration(pRNG_f, Umu); + + RealD checkTolerance = (getPrecision::value == 1) ? 1e-7 : 1e-15; + + RealD mass = -0.30; + RealD csw = 1.9192; + + WilsonCloverFermionR Dwc(Umu, *Grid_f, *RBGrid_f, mass, csw, csw); + MdagMLinearOperator MdagMOp_Dwc(Dwc); + + ///////////////////////////////////////////////////////////////////////////// + // Type definitions // + ///////////////////////////////////////////////////////////////////////////// + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseDiracMatrix; + typedef CoarseDiracMatrix::CoarseVector CoarseVector; + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Aggregation // + ///////////////////////////////////////////////////////////////////////////// + + Aggregates Aggs(Grid_c, Grid_f, 0); + { + LatticeFermion tmp(Aggs.subspace[0].Grid()); + for(int n = 0; n < nb; n++) { + gaussian(pRNG_f, Aggs.subspace[n]); + G5C(tmp, Aggs.subspace[n]); + axpby(Aggs.subspace[n + nb], 0.5, -0.5, Aggs.subspace[n], tmp); + axpby(Aggs.subspace[n], 0.5, 0.5, Aggs.subspace[n], tmp); + } + } + + ///////////////////////////////////////////////////////////////////////////// + // Setup of CoarsenedMatrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + const int hermitian = 0; + CoarseDiracMatrix Dc(*Grid_c, *RBGrid_c, hermitian); + Dc.CoarsenOperator(Grid_f, MdagMOp_Dwc, Aggs); + MdagMLinearOperator MdagMOp_Dc(Dc); + + ///////////////////////////////////////////////////////////////////////////// + // Setup vectors used in all tests // + ///////////////////////////////////////////////////////////////////////////// + + 
CoarseVector src(Grid_c); random(pRNG_c, src); + CoarseVector diff(Grid_c); diff = Zero(); + + ///////////////////////////////////////////////////////////////////////////// + // Start of tests // + ///////////////////////////////////////////////////////////////////////////// + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Dhop + Mdiag = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + Dc.Mdiag(src, phi); std::cout << GridLogMessage << "Applied Mdiag" << std::endl; + Dc.Dhop(src, chi, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + Dc.M(src, ref); std::cout << GridLogMessage << "Applied M" << std::endl; + + res = phi + chi; + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Munprec), norm2(Dhop + Mdiag), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev << " -> check " + << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe = Dhop" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector res_e(RBGrid_c); res_e = Zero(); + CoarseVector res_o(RBGrid_c); res_o = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + Dc.Meooe(src_e, res_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(src_o, res_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Dhop(src, ref, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + + setCheckerboard(res, res_o); + setCheckerboard(res, res_e); + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dhop), norm2(Meo + Moe), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag M^dag M v)| = 0" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.M(src, tmp); std::cout << GridLogMessage << "Applied M" << std::endl; + Dc.Mdag(tmp, phi); std::cout << GridLogMessage << "Applied Mdag" << std::endl; + + std::cout << GridLogMessage << "src = " << norm2(src) << " tmp = " << norm2(tmp) << " phi = " << norm2(phi) << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag M^dag M v), Im(v^dag M^dag M v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag Mooee^dag Mooee v)| = 0 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeDag(tmp, phi); std::cout << GridLogMessage << "Applied MooeeDag" << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag Mooee^dag Mooee v), Im(v^dag Mooee^dag Mooee v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeInv(tmp, phi); std::cout << GridLogMessage << "Applied MooeeInv" << std::endl; + + diff = src - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(src); + std::cout << GridLogMessage << "norm2(src), norm2(MooeeInv Mooee src), abs. deviation, rel. deviation: " + << norm2(src) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MeooeDagger is the dagger of Meooe by requiring" << std::endl; + std::cout << GridLogMessage << "= < phi | Meooe | chi > * = < chi | Meooe^dag| phi>" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + // clang-format off + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + // clang-format on + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + Dc.Meooe(chi_e, dchi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(chi_o, dchi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.MeooeDag(phi_e, dphi_o); std::cout << GridLogMessage << "Applied MeoDag" << std::endl; + Dc.MeooeDag(phi_o, dphi_e); std::cout << GridLogMessage << "Applied MoeDag" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogDebug << "norm dchi_e = " << norm2(dchi_e) << " norm dchi_o = " << norm2(dchi_o) << " norm dphi_e = " << norm2(dphi_e) + << " norm dphi_o = " << norm2(dphi_e) << std::endl; + + 
std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_o) " << phiDchi_e - conj(chiDphi_o) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_e) " << phiDchi_o - conj(chiDphi_e) << std::endl; + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.Mooee(chi_e, tmp_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.MooeeInv(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInv" << std::endl; + Dc.Mooee(chi_o, tmp_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.MooeeInv(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInv" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeInv Mee chi), abs. deviation, rel. 
deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeDag MooeeInvDag = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.MooeeDag(chi_e, tmp_e); std::cout << GridLogMessage << "Applied MeeDag" << std::endl; + Dc.MooeeInvDag(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInvDag" << std::endl; + Dc.MooeeDag(chi_o, tmp_o); std::cout << GridLogMessage << "Applied MooDag" << std::endl; + Dc.MooeeInvDag(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInvDag" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeDag MeeInvDag chi), abs. deviation, rel. deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe + Moo + Mee = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + + // M phi = (Mooee src_e + Meooe src_o , Mooee src_o + Meooe src_e) + + Dc.M(src, ref); // Reference result from the unpreconditioned operator + + // EO matrix + Dc.Mooee(src_e, chi_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.Mooee(src_o, chi_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.Meooe(src_o, phi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Meooe(src_e, phi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + + phi_o += chi_o; + phi_e += chi_e; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + std::cout << GridLogDebug << "norm phi_e = " << norm2(phi_e) << " norm phi_o = " << norm2(phi_o) << " norm phi = " << norm2(phi) << std::endl; + + diff = ref - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dunprec), norm2(Deoprec), abs. deviation, rel. 
deviation: " + << norm2(ref) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MpcDagMpc is hermitian" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + SchurDiagMooeeOperator HermOpEO(Dc); + + HermOpEO.MpcDagMpc(chi_e, dchi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_e" << std::endl; + HermOpEO.MpcDagMpc(chi_o, dchi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_o" << std::endl; + HermOpEO.MpcDagMpc(phi_e, dphi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_e" << std::endl; + HermOpEO.MpcDagMpc(phi_o, dphi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_o" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << 
GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_e) " << phiDchi_e - conj(chiDphi_e) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_o) " << phiDchi_o - conj(chiDphi_o) << std::endl; + } + + Grid_finalize(); +} From 01652d8cfea1ad463b037b20aabfc4f3b0dc7d37 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 05:56:02 -0400 Subject: [PATCH 008/201] SlabAllocator --- Grid/threads/Accelerator.h | 13 +-- Grid/threads/SlabAllocator.cc | 161 ++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 11 deletions(-) create mode 100644 Grid/threads/SlabAllocator.cc diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1a3dfdc2..89d45a17 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -153,18 +153,7 @@ inline void *acceleratorAllocShared(size_t bytes) } return ptr; }; -inline void *acceleratorAllocDevice(size_t bytes) -{ - void *ptr=NULL; - auto err = cudaMalloc((void **)&ptr,bytes); - if( err != cudaSuccess ) { - ptr = (void *) NULL; - printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); - } - return ptr; -}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; -inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline int acceleratorIsCommunicable(void *ptr) @@ -176,6 +165,8 @@ inline int acceleratorIsCommunicable(void *ptr) if(uvm) return 0; else return 1; } +void *acceleratorAllocDevice(size_t bytes); +void acceleratorFreeDevice(void* ptr); #endif diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc new file mode 100644 index 00000000..da863687 --- /dev/null +++ 
b/Grid/threads/SlabAllocator.cc @@ -0,0 +1,161 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./Grid/threads/SlabAllocator.cc + + Copyright (C) 2020 + +Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +#include + +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_CUDA + +#define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) +#define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) + +void *acceleratorAllocDeviceCUDA(size_t bytes) { + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + if( err != cudaSuccess ) { + ptr = (void *) NULL; + printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); + } + return ptr; +} + +void acceleratorFreeDeviceCUDA(void *ptr) { + cudaFree(ptr); +} + +struct grid_device_heap_slab_t { + void* Ptr; + size_t ElementSize; + size_t Elements; + std::unordered_set Allocated; + std::unordered_set Available; +}; + +std::unordered_map DeviceHeapPtrTable; +std::unordered_map > DeviceHeapSlabTable; + +void* SlabAllocateElement(grid_device_heap_slab_t* slab) { + 
assert(!slab->Available.empty()); + auto available = slab->Available.begin(); + auto slot = *available; + slab->Allocated.insert(slot); + slab->Available.erase(available); + + void* Ptr = (void*)((char*)slab->Ptr + slot * slab->ElementSize); + DeviceHeapPtrTable[Ptr] = slab; + + //std::cout << "Allocate element " << slot << " of slab " << slab << " of size " << slab->ElementSize << " with elements " << slab->Elements << + // " (allocated = " << slab->Allocated.size() << ", available = " << slab->Available.size() << ")" << std::endl; + + return Ptr; +} + +void SlabRemove(grid_device_heap_slab_t* slab) { + auto & t = DeviceHeapSlabTable[slab->ElementSize]; + assert(slab->Ptr); + DeviceHeapPtrTable.erase(slab->Ptr); + acceleratorFreeDeviceCUDA(slab->Ptr); + assert(t.count(slab) == 1); + t.erase(slab); + delete slab; + //std::cout << "Remove slab " << slab << std::endl; +} + +void SlabFreeElement(grid_device_heap_slab_t* slab, void* ElementPtr) { + size_t Offset = (size_t)ElementPtr - (size_t)slab->Ptr; + //std::cout << "SlabFreeElement offset " << Offset << std::endl; + assert(Offset < GRID_DEVICE_HEAP_SLAB_SIZE); + assert(Offset % slab->ElementSize == 0); + size_t slot = Offset / slab->ElementSize; + assert(slot >= 0); + assert(slab->Allocated.count(slot) == 1 && slab->Available.count(slot) == 0); + slab->Allocated.erase(slot); + slab->Available.insert(slot); + + //std::cout << "Free element " << slot << " of slab" << slab << std::endl; + + if (slab->Allocated.empty()) { + SlabRemove(slab); + } +} + +grid_device_heap_slab_t* SlabFind(size_t bytes) { + + grid_device_heap_slab_t* slab = 0; + std::unordered_set* slab_set = 0; + + decltype(DeviceHeapSlabTable.begin()) slabs = DeviceHeapSlabTable.find(bytes); + if (slabs == DeviceHeapSlabTable.end()) { + slab_set = &DeviceHeapSlabTable[bytes]; + } else { + slab_set = &slabs->second; + } + + for (auto& s : *slab_set) { + if (!s->Available.empty()) { + slab = &(*s); + break; + } + } + + if (!slab) { + slab = new 
grid_device_heap_slab_t; + slab_set->insert(slab); + slab->Ptr = acceleratorAllocDeviceCUDA(GRID_DEVICE_HEAP_SLAB_SIZE); + slab->ElementSize = bytes; + slab->Elements = GRID_DEVICE_HEAP_SLAB_SIZE / bytes; + for (size_t i=0;iElements;i++) + slab->Available.insert(i); + //std::cout << "New slab" << slab << std::endl; + } + + return slab; +} + +void *acceleratorAllocDevice(size_t bytes) { + if (bytes >= GRID_DEVICE_HEAP_SLAB_THRESHOLD) { + return acceleratorAllocDeviceCUDA(bytes); + } + + return SlabAllocateElement(SlabFind(bytes)); +} + +void acceleratorFreeDevice(void *ptr) { + auto p = DeviceHeapPtrTable.find(ptr); + if (p == DeviceHeapPtrTable.end()) { + acceleratorFreeDeviceCUDA(ptr); + } else { + SlabFreeElement(p->second,ptr); + } +} + +#endif + +NAMESPACE_END(Grid); From 32ff766dbd8416b45ae042463c8f65145b75806f Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:02:53 -0400 Subject: [PATCH 009/201] fix evict scheme, slab alloc --- Grid/allocator/MemoryManagerCache.cc | 6 ++++-- Grid/threads/SlabAllocator.cc | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 5dd7575e..7d4581d7 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -227,12 +227,13 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; + if (!AccCache.AccPtr) + EvictVictims(bytes); assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); @@ -361,12 +362,13 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if present, 
otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; + if (!AccCache.AccPtr) + EvictVictims(bytes); assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc index da863687..5590f835 100644 --- a/Grid/threads/SlabAllocator.cc +++ b/Grid/threads/SlabAllocator.cc @@ -36,6 +36,9 @@ NAMESPACE_BEGIN(Grid); #define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) #define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) +size_t currentDeviceAlloc = 0; +std::unordered_map ptr_size; + void *acceleratorAllocDeviceCUDA(size_t bytes) { void *ptr=NULL; auto err = cudaMalloc((void **)&ptr,bytes); @@ -43,11 +46,16 @@ void *acceleratorAllocDeviceCUDA(size_t bytes) { ptr = (void *) NULL; printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); } + currentDeviceAlloc += bytes; + ptr_size[ptr] = bytes; + std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; return ptr; } void acceleratorFreeDeviceCUDA(void *ptr) { cudaFree(ptr); + currentDeviceAlloc -= ptr_size[ptr]; + std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; } struct grid_device_heap_slab_t { From d50a2164d73d2c01048cd1f25f765a720472b2c6 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:06:06 -0400 Subject: [PATCH 010/201] remove slab allocator --- Grid/threads/Accelerator.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 89d45a17..1a3dfdc2 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -153,7 +153,18 @@ inline void *acceleratorAllocShared(size_t bytes) } return ptr; }; +inline void 
*acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + if( err != cudaSuccess ) { + ptr = (void *) NULL; + printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); + } + return ptr; +}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; +inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline int acceleratorIsCommunicable(void *ptr) @@ -165,8 +176,6 @@ inline int acceleratorIsCommunicable(void *ptr) if(uvm) return 0; else return 1; } -void *acceleratorAllocDevice(size_t bytes); -void acceleratorFreeDevice(void* ptr); #endif From 5cffa05c7e2965216200e7fff3183fce3f15c8bb Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:06:25 -0400 Subject: [PATCH 011/201] remove slab allocator file --- Grid/threads/SlabAllocator.cc | 169 ---------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 Grid/threads/SlabAllocator.cc diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc deleted file mode 100644 index 5590f835..00000000 --- a/Grid/threads/SlabAllocator.cc +++ /dev/null @@ -1,169 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./Grid/threads/SlabAllocator.cc - - Copyright (C) 2020 - -Author: Christoph Lehner - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#include - -#include - -NAMESPACE_BEGIN(Grid); - -#ifdef GRID_CUDA - -#define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) -#define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) - -size_t currentDeviceAlloc = 0; -std::unordered_map ptr_size; - -void *acceleratorAllocDeviceCUDA(size_t bytes) { - void *ptr=NULL; - auto err = cudaMalloc((void **)&ptr,bytes); - if( err != cudaSuccess ) { - ptr = (void *) NULL; - printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); - } - currentDeviceAlloc += bytes; - ptr_size[ptr] = bytes; - std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; - return ptr; -} - -void acceleratorFreeDeviceCUDA(void *ptr) { - cudaFree(ptr); - currentDeviceAlloc -= ptr_size[ptr]; - std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; -} - -struct grid_device_heap_slab_t { - void* Ptr; - size_t ElementSize; - size_t Elements; - std::unordered_set Allocated; - std::unordered_set Available; -}; - -std::unordered_map DeviceHeapPtrTable; -std::unordered_map > DeviceHeapSlabTable; - -void* SlabAllocateElement(grid_device_heap_slab_t* slab) { - assert(!slab->Available.empty()); - auto available = slab->Available.begin(); - auto slot = *available; - slab->Allocated.insert(slot); - slab->Available.erase(available); - - void* Ptr = (void*)((char*)slab->Ptr + slot * 
slab->ElementSize); - DeviceHeapPtrTable[Ptr] = slab; - - //std::cout << "Allocate element " << slot << " of slab " << slab << " of size " << slab->ElementSize << " with elements " << slab->Elements << - // " (allocated = " << slab->Allocated.size() << ", available = " << slab->Available.size() << ")" << std::endl; - - return Ptr; -} - -void SlabRemove(grid_device_heap_slab_t* slab) { - auto & t = DeviceHeapSlabTable[slab->ElementSize]; - assert(slab->Ptr); - DeviceHeapPtrTable.erase(slab->Ptr); - acceleratorFreeDeviceCUDA(slab->Ptr); - assert(t.count(slab) == 1); - t.erase(slab); - delete slab; - //std::cout << "Remove slab " << slab << std::endl; -} - -void SlabFreeElement(grid_device_heap_slab_t* slab, void* ElementPtr) { - size_t Offset = (size_t)ElementPtr - (size_t)slab->Ptr; - //std::cout << "SlabFreeElement offset " << Offset << std::endl; - assert(Offset < GRID_DEVICE_HEAP_SLAB_SIZE); - assert(Offset % slab->ElementSize == 0); - size_t slot = Offset / slab->ElementSize; - assert(slot >= 0); - assert(slab->Allocated.count(slot) == 1 && slab->Available.count(slot) == 0); - slab->Allocated.erase(slot); - slab->Available.insert(slot); - - //std::cout << "Free element " << slot << " of slab" << slab << std::endl; - - if (slab->Allocated.empty()) { - SlabRemove(slab); - } -} - -grid_device_heap_slab_t* SlabFind(size_t bytes) { - - grid_device_heap_slab_t* slab = 0; - std::unordered_set* slab_set = 0; - - decltype(DeviceHeapSlabTable.begin()) slabs = DeviceHeapSlabTable.find(bytes); - if (slabs == DeviceHeapSlabTable.end()) { - slab_set = &DeviceHeapSlabTable[bytes]; - } else { - slab_set = &slabs->second; - } - - for (auto& s : *slab_set) { - if (!s->Available.empty()) { - slab = &(*s); - break; - } - } - - if (!slab) { - slab = new grid_device_heap_slab_t; - slab_set->insert(slab); - slab->Ptr = acceleratorAllocDeviceCUDA(GRID_DEVICE_HEAP_SLAB_SIZE); - slab->ElementSize = bytes; - slab->Elements = GRID_DEVICE_HEAP_SLAB_SIZE / bytes; - for (size_t 
i=0;iElements;i++) - slab->Available.insert(i); - //std::cout << "New slab" << slab << std::endl; - } - - return slab; -} - -void *acceleratorAllocDevice(size_t bytes) { - if (bytes >= GRID_DEVICE_HEAP_SLAB_THRESHOLD) { - return acceleratorAllocDeviceCUDA(bytes); - } - - return SlabAllocateElement(SlabFind(bytes)); -} - -void acceleratorFreeDevice(void *ptr) { - auto p = DeviceHeapPtrTable.find(ptr); - if (p == DeviceHeapPtrTable.end()) { - acceleratorFreeDeviceCUDA(ptr); - } else { - SlabFreeElement(p->second,ptr); - } -} - -#endif - -NAMESPACE_END(Grid); From 97db2b8d208cd2b59a28eb8b4da440350ff1ba2c Mon Sep 17 00:00:00 2001 From: KANAMORI Issaku Date: Tue, 6 Oct 2020 17:25:59 +0900 Subject: [PATCH 012/201] add reordring of random number generator in IO --- Grid/parallelIO/BinaryIO.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h index 1f11add9..5df43f53 100644 --- a/Grid/parallelIO/BinaryIO.h +++ b/Grid/parallelIO/BinaryIO.h @@ -663,10 +663,15 @@ class BinaryIO { nersc_csum,scidac_csuma,scidac_csumb); timer.Start(); - thread_for(lidx,lsites,{ + thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation std::vector tmp(RngStateCount); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); - parallel_rng.SetState(tmp,lidx); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(lidx, lcoor); + int o_idx=grid->oIndex(lcoor); + int i_idx=grid->iIndex(lcoor); + int gidx=parallel_rng.generator_idx(o_idx,i_idx); + parallel_rng.SetState(tmp,gidx); }); timer.Stop(); @@ -723,7 +728,12 @@ class BinaryIO { std::vector iodata(lsites); thread_for(lidx,lsites,{ std::vector tmp(RngStateCount); - parallel_rng.GetState(tmp,lidx); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(lidx, lcoor); + int o_idx=grid->oIndex(lcoor); + int i_idx=grid->iIndex(lcoor); + int gidx=parallel_rng.generator_idx(o_idx,i_idx); + parallel_rng.GetState(tmp,gidx); 
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); }); timer.Stop(); From acac2d693855c13691b344cb021293701ba04a8c Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 6 Oct 2020 17:57:00 +0100 Subject: [PATCH 013/201] standard C/C++ I/O in benchmark --- benchmarks/Benchmark_IO.cc | 30 ++++++++- benchmarks/Benchmark_IO.hpp | 131 +++++++++++++++++++++++++++++++++++- 2 files changed, 157 insertions(+), 4 deletions(-) diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index c8c0937f..b59e4741 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -19,6 +19,31 @@ int main (int argc, char ** argv) int64_t threads = GridThread::GetThreads(); MSG << "Grid is setup to use " << threads << " threads" << std::endl; + + MSG << SEP << std::endl; + MSG << "Benchmark std C++ write" << std::endl; + MSG << SEP << std::endl; + for (int l = 4; l <= BENCH_IO_LMAX; l += 2) + { + auto mpi = GridDefaultMpi(); + std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), stdWrite); + } + + MSG << SEP << std::endl; + MSG << "Benchmark std C++ read" << std::endl; + MSG << SEP << std::endl; + for (int l = 4; l <= BENCH_IO_LMAX; l += 2) + { + auto mpi = GridDefaultMpi(); + std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), stdRead); + } + MSG << SEP << std::endl; MSG << "Benchmark Lime write" << std::endl; MSG << SEP << std::endl; @@ -27,10 +52,11 @@ int main (int argc, char ** argv) auto mpi = GridDefaultMpi(); std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; - std::cout << "-- Local volume " << l << "^4" << std::endl; + MSG << "-- Local volume " << l << "^4" << std::endl; writeBenchmark(latt, filestem(l), limeWrite); } + MSG << SEP << std::endl; MSG << "Benchmark Lime read" << std::endl; MSG << SEP << std::endl; for (int l = 4; l <= 
BENCH_IO_LMAX; l += 2) @@ -38,7 +64,7 @@ int main (int argc, char ** argv) auto mpi = GridDefaultMpi(); std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; - std::cout << "-- Local volume " << l << "^4" << std::endl; + MSG << "-- Local volume " << l << "^4" << std::endl; readBenchmark(latt, filestem(l), limeRead); } diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp index d3416353..73c198dc 100644 --- a/benchmarks/Benchmark_IO.hpp +++ b/benchmarks/Benchmark_IO.hpp @@ -14,13 +14,140 @@ using WriterFn = std::function ; template using ReaderFn = std::function; +// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API +// +// template +// void stdWrite(const std::string filestem, Field &vec) +// { +// std::string rankStr = std::to_string(vec.Grid()->ThisRank()); +// std::FILE *file = std::fopen((filestem + "." + rankStr + ".bin").c_str(), "wb"); +// size_t size; +// uint32_t crc; +// GridStopWatch ioWatch, crcWatch; + +// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); +// autoView(vec_v, vec, CpuRead); +// crcWatch.Start(); +// crc = GridChecksum::crc32(vec_v.cpu_ptr, size); +// std::fwrite(&crc, sizeof(uint32_t), 1, file); +// crcWatch.Stop(); +// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; +// ioWatch.Start(); +// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); +// ioWatch.Stop(); +// std::fclose(file); +// size *= vec.Grid()->ProcessorCount(); +// MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() +// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) +// << " MB/s" << std::endl; +// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; +// } +// +// template +// void stdRead(Field &vec, const std::string filestem) +// { +// std::string rankStr = std::to_string(vec.Grid()->ThisRank()); +// std::FILE *file = std::fopen((filestem + "." 
+ rankStr + ".bin").c_str(), "rb"); +// size_t size; +// uint32_t crcRead, crcData; +// GridStopWatch ioWatch, crcWatch; + +// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); +// crcWatch.Start(); +// std::fread(&crcRead, sizeof(uint32_t), 1, file); +// crcWatch.Stop(); +// { +// autoView(vec_v, vec, CpuWrite); +// ioWatch.Start(); +// std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); +// ioWatch.Stop(); +// std::fclose(file); +// } +// { +// autoView(vec_v, vec, CpuRead); +// crcWatch.Start(); +// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); +// crcWatch.Stop(); +// } +// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; +// assert(crcData == crcRead); +// size *= vec.Grid()->ProcessorCount(); +// MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() +// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) +// << " MB/s" << std::endl; +// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; +// } + +template +void stdWrite(const std::string filestem, Field &vec) +{ + std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary); + size_t size, sizec; + uint32_t crc; + GridStopWatch ioWatch, crcWatch; + + size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + sizec = size/sizeof(char); // just in case of... 
+ autoView(vec_v, vec, CpuRead); + crcWatch.Start(); + crc = GridChecksum::crc32(vec_v.cpu_ptr, size); + file.write(reinterpret_cast(&crc), sizeof(uint32_t)/sizeof(char)); + crcWatch.Stop(); + MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; + ioWatch.Start(); + file.write(reinterpret_cast(vec_v.cpu_ptr), sizec); + file.flush(); + ioWatch.Stop(); + size *= vec.Grid()->ProcessorCount(); + MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() + << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) + << " MB/s" << std::endl; + MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; +} + +template +void stdRead(Field &vec, const std::string filestem) +{ + std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary); + size_t size, sizec; + uint32_t crcRead, crcData; + GridStopWatch ioWatch, crcWatch; + + size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + sizec = size/sizeof(char); // just in case of... 
+ crcWatch.Start(); + file.read(reinterpret_cast(&crcRead), sizeof(uint32_t)/sizeof(char)); + crcWatch.Stop(); + { + autoView(vec_v, vec, CpuWrite); + ioWatch.Start(); + file.read(reinterpret_cast(vec_v.cpu_ptr), sizec); + ioWatch.Stop(); + } + { + autoView(vec_v, vec, CpuRead); + crcWatch.Start(); + crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); + crcWatch.Stop(); + } + MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; + assert(crcData == crcRead); + size *= vec.Grid()->ProcessorCount(); + MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() + << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) + << " MB/s" << std::endl; + MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; +} + template void limeWrite(const std::string filestem, Field &vec) { emptyUserRecord record; ScidacWriter binWriter(vec.Grid()->IsBoss()); - binWriter.open(filestem + ".bin"); + binWriter.open(filestem + ".lime.bin"); binWriter.writeScidacFieldRecord(vec, record); binWriter.close(); } @@ -31,7 +158,7 @@ void limeRead(Field &vec, const std::string filestem) emptyUserRecord record; ScidacReader binReader; - binReader.open(filestem + ".bin"); + binReader.open(filestem + ".lime.bin"); binReader.readScidacFieldRecord(vec, record); binReader.close(); } From e9c5a271a886c90f73f41aba22f703292f75a1e5 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 6 Oct 2020 17:58:16 +0100 Subject: [PATCH 014/201] fixing potential issues with log alignment and timer I/O --- Grid/log/Log.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Grid/log/Log.h b/Grid/log/Log.h index d459a4a9..68693647 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -130,6 +130,8 @@ public: friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { + std::ios_base::fmtflags f(stream.flags()); + stream << log.background()<< std::left; if (log.topWidth > 0) { @@ -152,6 +154,8 @@ public: << now << 
log.background() << " : " ; } stream << log.colour(); + stream.flags(f); + return stream; } else { return devnull; From 35a69a513365d7cdca79510f8925cdb91b8ebe71 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 6 Oct 2020 21:48:35 -0400 Subject: [PATCH 015/201] SU4 x SU4 --- benchmarks/Benchmark_ITT.cc | 78 ++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 5 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index dc09549c..5e1e1f66 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -202,6 +202,8 @@ public: return; } + + static void Memory(void) { const int Nvec=8; @@ -266,6 +268,66 @@ public: }; + static void SU4(void) + { + const int Nc4=4; + typedef Lattice< iMatrix< vComplexF,Nc4> > LatticeSU4; + + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + NN =Grid.NodeCount(); + + + LatticeSU4 z(&Grid); z=Zero(); + LatticeSU4 x(&Grid); x=Zero(); + LatticeSU4 y(&Grid); y=Zero(); + double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;i L_list({16,24,32}); + int sel=4; + std::vector L_list({8,12,16,24,32}); int selm1=sel-1; std::vector wilson; @@ -624,7 +687,6 @@ int main (int argc, char ** argv) dwf4.push_back(result); } - /* std::cout<1) ) { std::cout< Date: Wed, 7 Oct 2020 15:31:51 +0100 Subject: [PATCH 016/201] I/O benchmark code cleaning --- benchmarks/Benchmark_IO.cc | 31 ++++++++-------- benchmarks/Benchmark_IO.hpp | 4 +-- benchmarks/Benchmark_IO_vs_dir.cc | 59 +++++++++++++++++-------------- 3 files changed, 50 insertions(+), 44 deletions(-) diff --git a/benchmarks/Benchmark_IO.cc 
b/benchmarks/Benchmark_IO.cc index b59e4741..5e4cef9f 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -14,61 +14,62 @@ std::string filestem(const int l) int main (int argc, char ** argv) { -#ifdef HAVE_LIME Grid_init(&argc,&argv); - int64_t threads = GridThread::GetThreads(); + int64_t threads = GridThread::GetThreads(); + auto mpi = GridDefaultMpi(); + std::vector latt; + MSG << "Grid is setup to use " << threads << " threads" << std::endl; + MSG << "MPI partition " << mpi << std::endl; MSG << SEP << std::endl; - MSG << "Benchmark std C++ write" << std::endl; + MSG << "Benchmark std write" << std::endl; MSG << SEP << std::endl; for (int l = 4; l <= BENCH_IO_LMAX; l += 2) { - auto mpi = GridDefaultMpi(); - std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; MSG << "-- Local volume " << l << "^4" << std::endl; writeBenchmark(latt, filestem(l), stdWrite); } MSG << SEP << std::endl; - MSG << "Benchmark std C++ read" << std::endl; + MSG << "Benchmark std read" << std::endl; MSG << SEP << std::endl; for (int l = 4; l <= BENCH_IO_LMAX; l += 2) { - auto mpi = GridDefaultMpi(); - std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; MSG << "-- Local volume " << l << "^4" << std::endl; readBenchmark(latt, filestem(l), stdRead); } +#ifdef HAVE_LIME MSG << SEP << std::endl; - MSG << "Benchmark Lime write" << std::endl; + MSG << "Benchmark Grid C-Lime write" << std::endl; MSG << SEP << std::endl; for (int l = 4; l <= BENCH_IO_LMAX; l += 2) { - auto mpi = GridDefaultMpi(); - std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; MSG << "-- Local volume " << l << "^4" << std::endl; writeBenchmark(latt, filestem(l), limeWrite); } MSG << SEP << std::endl; - MSG << "Benchmark Lime read" << std::endl; + MSG << "Benchmark Grid C-Lime read" << std::endl; MSG << SEP << std::endl; for 
(int l = 4; l <= BENCH_IO_LMAX; l += 2) { - auto mpi = GridDefaultMpi(); - std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; MSG << "-- Local volume " << l << "^4" << std::endl; readBenchmark(latt, filestem(l), limeRead); } +#endif Grid_finalize(); -#endif + return EXIT_SUCCESS; } diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp index 73c198dc..39af14ba 100644 --- a/benchmarks/Benchmark_IO.hpp +++ b/benchmarks/Benchmark_IO.hpp @@ -101,7 +101,7 @@ void stdWrite(const std::string filestem, Field &vec) ioWatch.Stop(); size *= vec.Grid()->ProcessorCount(); MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() - << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) + << ", " << size/1024./1024./(ioWatch.useconds()/1.e6) << " MB/s" << std::endl; MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; } @@ -136,7 +136,7 @@ void stdRead(Field &vec, const std::string filestem) assert(crcData == crcRead); size *= vec.Grid()->ProcessorCount(); MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() - << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) + << ", " << size/1024./1024./(ioWatch.useconds()/1.e6) << " MB/s" << std::endl; MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; } diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc index 6e6c9ae0..9c254e27 100644 --- a/benchmarks/Benchmark_IO_vs_dir.cc +++ b/benchmarks/Benchmark_IO_vs_dir.cc @@ -34,46 +34,51 @@ int main (int argc, char ** argv) } Grid_init(&argc,&argv); - int64_t threads = GridThread::GetThreads(); + auto mpi = GridDefaultMpi(); + MSG << "Grid is setup to use " << threads << " threads" << std::endl; - MSG << SEP << std::endl; - MSG << "Benchmark double precision Lime write" << std::endl; - MSG << SEP << std::endl; - for (auto &d: dir) - { - MSG << "-- Directory " << d << 
std::endl; - writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); - } + MSG << "MPI partition " << mpi << std::endl; MSG << SEP << std::endl; - MSG << "Benchmark double precision Lime read" << std::endl; + MSG << "Benchmark Grid C-Lime write" << std::endl; MSG << SEP << std::endl; for (auto &d: dir) { MSG << "-- Directory " << d << std::endl; - readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); + writeBenchmark(GridDefaultLatt(), d + "/ioBench", + limeWrite, Ls, rb); + } + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime read" << std::endl; + MSG << SEP << std::endl; + for (auto &d: dir) + { + MSG << "-- Directory " << d << std::endl; + readBenchmark(GridDefaultLatt(), d + "/ioBench", + limeRead, Ls, rb); } - MSG << SEP << std::endl; - MSG << "Benchmark single precision Lime write" << std::endl; - MSG << SEP << std::endl; - for (auto &d: dir) - { - MSG << "-- Directory " << d << std::endl; - writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); - } + // MSG << SEP << std::endl; + // MSG << "Benchmark single precision Lime write" << std::endl; + // MSG << SEP << std::endl; + // for (auto &d: dir) + // { + // MSG << "-- Directory " << d << std::endl; + // writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); + // } - MSG << SEP << std::endl; - MSG << "Benchmark single precision Lime read" << std::endl; - MSG << SEP << std::endl; - for (auto &d: dir) - { - MSG << "-- Directory " << d << std::endl; - readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); - } + // MSG << SEP << std::endl; + // MSG << "Benchmark single precision Lime read" << std::endl; + // MSG << SEP << std::endl; + // for (auto &d: dir) + // { + // MSG << "-- Directory " << d << std::endl; + // readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); + // } Grid_finalize(); + #endif return EXIT_SUCCESS; } From 9ba3647bdf466e4757284ed4a49ec4ee32f679c9 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: 
Wed, 7 Oct 2020 15:35:03 +0100 Subject: [PATCH 017/201] script to convert I/O benchmark logs to CSV --- benchmarks/benchmark-io-csv.sh | 76 ++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100755 benchmarks/benchmark-io-csv.sh diff --git a/benchmarks/benchmark-io-csv.sh b/benchmarks/benchmark-io-csv.sh new file mode 100755 index 00000000..cc61b006 --- /dev/null +++ b/benchmarks/benchmark-io-csv.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +awkscript=' +BEGIN{ + i = 0; + print "local L,std read (MB/s),std write (MB/s),Grid Lime read (MB/s),Grid Lime write (MB/s)" +} + +/Benchmark std write/{ + i = 0; + mode = "stdWrite"; +} + +/Benchmark std read/{ + i = 0; + mode = "stdRead" +} + +/Benchmark Grid C-Lime write/{ + i = 0; + mode = "gridWrite"; +} + +/Benchmark Grid C-Lime read/{ + i = 0; + mode = "gridRead"; +} + +/Local volume/{ + match($0, "[0-9]+\\^4"); + l[i] = substr($0, RSTART, RLENGTH-2); +} + +/MB\/s/{ + match($0, "[0-9.eE]+ MB/s"); + p = substr($0, RSTART, RLENGTH-5); + if (mode == "stdWrite") + { + sw[i] = p; + } + else if (mode == "stdRead") + { + sr[i] = p; + } + else if (mode == "gridWrite") + { + gw[i] = p; + } + else if (mode == "gridRead") + { + gr[i] = p; + } + i++; +} + +END{ + s = 0 + for (a in l) + { + s++; + } + for (j = 0; j < s; j++) + { + printf("%s,%s,%s,%s,%s\n", l[j], sr[j], sw[j], gr[j], gw[j]); + } + printf("\n"); +} +' + +if (( $# != 1 )); then + echo "usage: `basename $0` " 1>&2 + exit 1 +fi +LOG=$1 + +awk "${awkscript}" ${LOG} From 1ba25a0d8c728f18dad32649c5be0572f79e27af Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Wed, 7 Oct 2020 15:38:41 +0100 Subject: [PATCH 018/201] more I/O benchmark code cleaning --- benchmarks/Benchmark_IO_vs_dir.cc | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc index 9c254e27..9ccfd554 100644 --- a/benchmarks/Benchmark_IO_vs_dir.cc +++ 
b/benchmarks/Benchmark_IO_vs_dir.cc @@ -8,7 +8,6 @@ using namespace Grid; int main (int argc, char ** argv) { -#ifdef HAVE_LIME std::vector dir; unsigned int Ls; bool rb; @@ -40,6 +39,26 @@ int main (int argc, char ** argv) MSG << "Grid is setup to use " << threads << " threads" << std::endl; MSG << "MPI partition " << mpi << std::endl; + MSG << SEP << std::endl; + MSG << "Benchmark Grid std write" << std::endl; + MSG << SEP << std::endl; + for (auto &d: dir) + { + MSG << "-- Directory " << d << std::endl; + writeBenchmark(GridDefaultLatt(), d + "/ioBench", + stdWrite, Ls, rb); + } + MSG << SEP << std::endl; + MSG << "Benchmark Grid std read" << std::endl; + MSG << SEP << std::endl; + for (auto &d: dir) + { + MSG << "-- Directory " << d << std::endl; + readBenchmark(GridDefaultLatt(), d + "/ioBench", + stdRead, Ls, rb); + } + +#ifdef HAVE_LIME MSG << SEP << std::endl; MSG << "Benchmark Grid C-Lime write" << std::endl; MSG << SEP << std::endl; @@ -58,6 +77,7 @@ int main (int argc, char ** argv) readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); } +#endif // MSG << SEP << std::endl; // MSG << "Benchmark single precision Lime write" << std::endl; @@ -78,7 +98,6 @@ int main (int argc, char ** argv) // } Grid_finalize(); - -#endif + return EXIT_SUCCESS; } From d2012776524c616955a688b6ae2e05097d84548e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 7 Oct 2020 13:07:00 -0400 Subject: [PATCH 019/201] Expose Nc as a compile time configure option. 
Remove precision option --- Grid/qcd/QCD.h | 2 +- benchmarks/Benchmark_ITT.cc | 16 ++++-- benchmarks/Benchmark_dwf.cc | 2 +- benchmarks/Benchmark_gparity.cc | 2 +- benchmarks/Benchmark_mooee.cc | 4 +- configure.ac | 52 +++++++++++++------ tests/IO/Test_ildg_io.cc | 2 +- tests/IO/Test_nersc_io.cc | 2 +- tests/Test_cayley_even_odd_vec.cc | 2 +- tests/Test_compressed_lanczos_hot_start.cc | 2 +- tests/Test_dwf_mixedcg_prec.cc | 2 +- tests/Test_dwf_mixedcg_prec_halfcomms.cc | 2 +- tests/core/Test_cf_coarsen_support.cc | 2 +- tests/core/Test_checker.cc | 2 +- tests/core/Test_contfrac_even_odd.cc | 2 +- tests/core/Test_dwf_eofa_even_odd.cc | 2 +- tests/core/Test_dwf_even_odd.cc | 2 +- tests/core/Test_fft.cc | 2 +- tests/core/Test_fft_gfix.cc | 8 +-- tests/core/Test_gparity.cc | 2 +- tests/core/Test_gpwilson_even_odd.cc | 2 +- tests/core/Test_lie_generators.cc | 46 ++++++++-------- tests/core/Test_main.cc | 2 +- tests/core/Test_mobius_eofa_even_odd.cc | 2 +- tests/core/Test_quenched_update.cc | 6 +-- tests/core/Test_staggered.cc | 2 +- tests/core/Test_staggered5D.cc | 2 +- tests/core/Test_staggered5Dvec.cc | 2 +- tests/core/Test_staggered5DvecF.cc | 2 +- tests/core/Test_staggered_naive.cc | 2 +- tests/core/Test_wilson_clover.cc | 2 +- tests/core/Test_wilson_even_odd.cc | 2 +- .../core/Test_wilson_twisted_mass_even_odd.cc | 2 +- tests/debug/Test_cayley_cg.cc | 2 +- tests/debug/Test_cayley_coarsen_support.cc | 2 +- tests/debug/Test_cayley_even_odd.cc | 2 +- tests/debug/Test_cayley_ldop_cr.cc | 6 +-- tests/debug/Test_cayley_mres.cc | 4 +- tests/debug/Test_heatbath_dwf_eofa.cc | 2 +- tests/debug/Test_heatbath_dwf_eofa_gparity.cc | 2 +- tests/debug/Test_heatbath_mobius_eofa.cc | 2 +- .../Test_heatbath_mobius_eofa_gparity.cc | 2 +- tests/debug/Test_reweight_dwf_eofa.cc | 2 +- tests/debug/Test_reweight_dwf_eofa_gparity.cc | 2 +- tests/debug/Test_reweight_mobius_eofa.cc | 2 +- .../Test_reweight_mobius_eofa_gparity.cc | 2 +- tests/forces/Test_contfrac_force.cc | 4 +- 
tests/forces/Test_dwf_force.cc | 4 +- tests/forces/Test_dwf_force_eofa.cc | 4 +- tests/forces/Test_dwf_gpforce.cc | 6 +-- tests/forces/Test_dwf_gpforce_eofa.cc | 4 +- tests/forces/Test_gp_plaq_force.cc | 4 +- tests/forces/Test_gp_rect_force.cc | 4 +- tests/forces/Test_gpdwf_force.cc | 4 +- tests/forces/Test_gpwilson_force.cc | 4 +- tests/forces/Test_laplacian_force.cc | 4 +- tests/forces/Test_mobius_force.cc | 4 +- tests/forces/Test_mobius_force_eofa.cc | 4 +- tests/forces/Test_mobius_gpforce_eofa.cc | 4 +- tests/forces/Test_partfrac_force.cc | 4 +- tests/forces/Test_rect_force.cc | 4 +- tests/forces/Test_wilson_force.cc | 4 +- tests/forces/Test_wilsonclover_force.cc | 6 +-- tests/forces/Test_zmobius_force.cc | 4 +- ..._dwf_compressed_lanczos_reorg_synthetic.cc | 2 +- tests/lanczos/Test_dwf_lanczos.cc | 2 +- tests/lanczos/Test_wilson_lanczos.cc | 2 +- tests/qdpxx/Test_qdpxx_baryon.cc | 2 +- tests/qdpxx/Test_qdpxx_loops_staples.cc | 2 +- tests/qdpxx/Test_qdpxx_munprec.cc | 2 +- tests/qdpxx/Test_qdpxx_stag.cc | 2 +- tests/qdpxx/Test_qdpxx_wilson.cc | 2 +- tests/smearing/Test_smearing.cc | 4 +- tests/solver/Test_cf_cr_unprec.cc | 2 +- tests/solver/Test_contfrac_cg.cc | 2 +- tests/solver/Test_dwf_cg_prec.cc | 2 +- tests/solver/Test_dwf_cg_schur.cc | 2 +- tests/solver/Test_dwf_cg_unprec.cc | 2 +- tests/solver/Test_dwf_cr_unprec.cc | 2 +- tests/solver/Test_dwf_fpgcr.cc | 2 +- tests/solver/Test_dwf_mrhs_cg.cc | 2 +- tests/solver/Test_dwf_mrhs_cg_mpi.cc | 4 +- tests/solver/Test_dwf_mrhs_cg_mpieo.cc | 2 +- tests/solver/Test_dwf_qmr_unprec.cc | 2 +- tests/solver/Test_mobius_bcg.cc | 2 +- tests/solver/Test_mobius_bcg_nosplit.cc | 4 +- tests/solver/Test_mobius_bcg_phys_nosplit.cc | 4 +- tests/solver/Test_mobius_bcg_prec_nosplit.cc | 4 +- tests/solver/Test_split_grid.cc | 2 +- tests/solver/Test_staggered_block_cg_prec.cc | 2 +- .../solver/Test_staggered_block_cg_unprec.cc | 2 +- tests/solver/Test_staggered_cagmres_unprec.cc | 2 +- tests/solver/Test_staggered_cg_prec.cc | 2 +- 
tests/solver/Test_staggered_cg_schur.cc | 2 +- tests/solver/Test_staggered_cg_unprec.cc | 2 +- tests/solver/Test_staggered_fcagmres_prec.cc | 2 +- tests/solver/Test_staggered_fgmres_prec.cc | 2 +- tests/solver/Test_staggered_gmres_unprec.cc | 2 +- tests/solver/Test_staggered_mr_unprec.cc | 2 +- tests/solver/Test_staggered_multishift.cc | 2 +- tests/solver/Test_wilson_cagmres_unprec.cc | 2 +- tests/solver/Test_wilson_cg_prec.cc | 2 +- tests/solver/Test_wilson_cg_schur.cc | 2 +- tests/solver/Test_wilson_cg_unprec.cc | 2 +- tests/solver/Test_wilson_cr_unprec.cc | 2 +- tests/solver/Test_wilson_fcagmres_prec.cc | 2 +- tests/solver/Test_wilson_fgmres_prec.cc | 2 +- tests/solver/Test_wilson_gmres_unprec.cc | 2 +- tests/solver/Test_wilson_mg.cc | 2 +- tests/solver/Test_wilson_mg_mp.cc | 2 +- tests/solver/Test_wilson_mr_unprec.cc | 2 +- tests/solver/Test_wilson_qmr_unprec.cc | 2 +- .../solver/Test_wilsonclover_bicgstab_prec.cc | 2 +- .../Test_wilsonclover_bicgstab_schur.cc | 2 +- .../Test_wilsonclover_bicgstab_unprec.cc | 2 +- .../Test_wilsonclover_cagmres_unprec.cc | 2 +- tests/solver/Test_wilsonclover_cg_prec.cc | 2 +- tests/solver/Test_wilsonclover_cg_schur.cc | 2 +- tests/solver/Test_wilsonclover_cg_unprec.cc | 2 +- .../solver/Test_wilsonclover_fcagmres_prec.cc | 2 +- tests/solver/Test_wilsonclover_fgmres_prec.cc | 2 +- .../solver/Test_wilsonclover_gmres_unprec.cc | 2 +- tests/solver/Test_wilsonclover_mg.cc | 2 +- tests/solver/Test_wilsonclover_mg_lime.cc | 2 +- tests/solver/Test_wilsonclover_mg_mp.cc | 2 +- .../Test_wilsonclover_mixedbicgstab_prec.cc | 2 +- .../solver/Test_wilsonclover_mixedcg_prec.cc | 2 +- tests/solver/Test_wilsonclover_mr_unprec.cc | 2 +- tests/solver/Test_zMADWF_prec.cc | 2 +- tests/solver/Test_zmobius_cg_prec.cc | 2 +- 130 files changed, 232 insertions(+), 204 deletions(-) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index faacac63..76d7def4 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -47,7 +47,7 @@ static constexpr int Ym = 5; static 
constexpr int Zm = 6; static constexpr int Tm = 7; -static constexpr int Nc=3; +static constexpr int Nc=Config_Nc; static constexpr int Ns=4; static constexpr int Nd=4; static constexpr int Nhs=2; // half spinor diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 5e1e1f66..df5427c1 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -358,6 +358,7 @@ public: ///////// Welcome message //////////// std::cout<::HotConfiguration(RNG4,Umu); Fermion src (FGrid); random(RNG5,src); Fermion src_e (FrbGrid); Fermion src_o (FrbGrid); @@ -449,7 +450,13 @@ public: FGrid->Barrier(); double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; if ( mflops::HotConfiguration(RNG4,Umu); typename Action::ImplParams params; Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 2ef5921d..d7b49122 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -108,7 +108,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Drawing gauge field" << std::endl; LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; #if 0 Umu=1.0; diff --git a/benchmarks/Benchmark_gparity.cc b/benchmarks/Benchmark_gparity.cc index b03e1b63..7fa7508a 100644 --- a/benchmarks/Benchmark_gparity.cc +++ b/benchmarks/Benchmark_gparity.cc @@ -63,7 +63,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Drawing gauge field" << std::endl; LatticeGaugeFieldF Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random 
gauge initialised " << std::endl; RealD mass=0.1; diff --git a/benchmarks/Benchmark_mooee.cc b/benchmarks/Benchmark_mooee.cc index ef16c908..0aaccecc 100644 --- a/benchmarks/Benchmark_mooee.cc +++ b/benchmarks/Benchmark_mooee.cc @@ -30,7 +30,7 @@ Author: paboyle using namespace std; using namespace Grid; - ; + int main (int argc, char ** argv) @@ -53,7 +53,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); std::cout << GridLogMessage << "Seeded"<::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "made random gauge fields"< U(4,&Fine); - SU3::HotConfiguration(pRNGa,Umu); + SU::HotConfiguration(pRNGa,Umu); FieldMetaData header; diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc index f5413e3b..c15c320e 100644 --- a/tests/IO/Test_nersc_io.cc +++ b/tests/IO/Test_nersc_io.cc @@ -84,7 +84,7 @@ int main (int argc, char ** argv) std::vector U(4,&Fine); - SU3::HotConfiguration(pRNGa,Umu); + SU::HotConfiguration(pRNGa,Umu); FieldMetaData header; std::string file("./ckpoint_lat.4000"); diff --git a/tests/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc index 0e71d910..c345efd9 100644 --- a/tests/Test_cayley_even_odd_vec.cc +++ b/tests/Test_cayley_even_odd_vec.cc @@ -80,7 +80,7 @@ int main (int argc, char ** argv) GridParallelRNG sRNG5(sFGrid); sRNG5.SeedFixedIntegers(seeds5); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); RealD mass=0.1; RealD M5 =1.8; diff --git a/tests/Test_compressed_lanczos_hot_start.cc b/tests/Test_compressed_lanczos_hot_start.cc index 8eb7a921..dc22cfca 100644 --- a/tests/Test_compressed_lanczos_hot_start.cc +++ b/tests/Test_compressed_lanczos_hot_start.cc @@ -202,7 +202,7 @@ int main (int argc, char ** argv) { std::vector seeds4({1,2,3,4}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); // 
FieldMetaData header; // NerscIO::readConfiguration(Umu,header,Params.config); diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index be881db9..da0b54cd 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -71,7 +71,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(UGrid); LatticeGaugeFieldF Umu_f(UGrid_f); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); precisionChange(Umu_f,Umu); diff --git a/tests/Test_dwf_mixedcg_prec_halfcomms.cc b/tests/Test_dwf_mixedcg_prec_halfcomms.cc index 4d94632c..8b0126dc 100644 --- a/tests/Test_dwf_mixedcg_prec_halfcomms.cc +++ b/tests/Test_dwf_mixedcg_prec_halfcomms.cc @@ -69,7 +69,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(UGrid); LatticeGaugeFieldF Umu_f(UGrid_f); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); precisionChange(Umu_f,Umu); diff --git a/tests/core/Test_cf_coarsen_support.cc b/tests/core/Test_cf_coarsen_support.cc index e787905e..ad0309b9 100644 --- a/tests/core/Test_cf_coarsen_support.cc +++ b/tests/core/Test_cf_coarsen_support.cc @@ -64,7 +64,7 @@ int main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); LatticeFermion err(FGrid); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); // std::vector U(4,UGrid); // for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); RealD mass=0.1; diff --git a/tests/core/Test_dwf_eofa_even_odd.cc b/tests/core/Test_dwf_eofa_even_odd.cc index 01fff9ea..64701069 100644 --- a/tests/core/Test_dwf_eofa_even_odd.cc +++ b/tests/core/Test_dwf_eofa_even_odd.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) LatticeFermion ref (FGrid); ref = Zero(); LatticeFermion tmp (FGrid); tmp = Zero(); LatticeFermion err (FGrid); err = Zero(); - LatticeGaugeField Umu 
(UGrid); SU3::HotConfiguration(RNG4, Umu); + LatticeGaugeField Umu (UGrid); SU::HotConfiguration(RNG4, Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_dwf_even_odd.cc b/tests/core/Test_dwf_even_odd.cc index 6093ee8f..4918f02a 100644 --- a/tests/core/Test_dwf_even_odd.cc +++ b/tests/core/Test_dwf_even_odd.cc @@ -72,7 +72,7 @@ int main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); tmp=Zero(); LatticeFermion err(FGrid); tmp=Zero(); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 2ba3752b..212b1a35 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -138,7 +138,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(&GRID); - SU3::ColdConfiguration(pRNG,Umu); // Unit gauge + SU::ColdConfiguration(pRNG,Umu); // Unit gauge // Umu=Zero(); //////////////////////////////////////////////////// // Wilson test diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 228770a8..87dbc242 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -73,11 +73,11 @@ int main (int argc, char ** argv) LatticeColourMatrix xform2(&GRID); // Gauge xform LatticeColourMatrix xform3(&GRID); // Gauge xform - SU3::ColdConfiguration(pRNG,Umu); // Unit gauge + SU::ColdConfiguration(pRNG,Umu); // Unit gauge Uorg=Umu; Urnd=Umu; - SU3::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge + SU::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge Real plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::HotConfiguration(pRNG,Umu); // Unit gauge plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::HotConfiguration(pRNG,Umu); // Unit gauge plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette 
"<::HotConfiguration(RNG4_2f,Umu_2f); StandardFermionField src (FGrid_2f); StandardFermionField tmpsrc(FGrid_2f); diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc index bf37f4d5..69ace859 100644 --- a/tests/core/Test_gpwilson_even_odd.cc +++ b/tests/core/Test_gpwilson_even_odd.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) FermionField ref(&Grid); ref=Zero(); FermionField tmp(&Grid); tmp=Zero(); FermionField err(&Grid); tmp=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); double volume=1; diff --git a/tests/core/Test_lie_generators.cc b/tests/core/Test_lie_generators.cc index 471cea25..9ae59774 100644 --- a/tests/core/Test_lie_generators.cc +++ b/tests/core/Test_lie_generators.cc @@ -66,14 +66,14 @@ int main(int argc, char** argv) { std::cout << GridLogMessage << "*********************************************" << std::endl; - std::cout << GridLogMessage << "* Generators for SU(3)" << std::endl; + std::cout << GridLogMessage << "* Generators for SU(Nc" << std::endl; std::cout << GridLogMessage << "*********************************************" << std::endl; - SU3::printGenerators(); - std::cout << "Dimension of adjoint representation: "<< SU3Adjoint::Dimension << std::endl; - SU3Adjoint::printGenerators(); - SU3::testGenerators(); - SU3Adjoint::testGenerators(); + SU::printGenerators(); + std::cout << "Dimension of adjoint representation: "<< SUAdjoint::Dimension << std::endl; + SUAdjoint::printGenerators(); + SU::testGenerators(); + SUAdjoint::testGenerators(); std::cout<({45,12,81,9})); - SU3Adjoint::LatticeAdjMatrix Gauss(grid); - SU3::LatticeAlgebraVector ha(grid); - SU3::LatticeAlgebraVector hb(grid); + SUAdjoint::LatticeAdjMatrix Gauss(grid); + SU::LatticeAlgebraVector ha(grid); + SU::LatticeAlgebraVector hb(grid); random(gridRNG,Gauss); std::cout << GridLogMessage << "Start projectOnAlgebra" << 
std::endl; - SU3Adjoint::projectOnAlgebra(ha, Gauss); + SUAdjoint::projectOnAlgebra(ha, Gauss); std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl; std::cout << GridLogMessage << "Start projector" << std::endl; - SU3Adjoint::projector(hb, Gauss); + SUAdjoint::projector(hb, Gauss); std::cout << GridLogMessage << "end projector" << std::endl; std::cout << GridLogMessage << "ReStart projector" << std::endl; - SU3Adjoint::projector(hb, Gauss); + SUAdjoint::projector(hb, Gauss); std::cout << GridLogMessage << "end projector" << std::endl; - SU3::LatticeAlgebraVector diff = ha -hb; + SU::LatticeAlgebraVector diff = ha -hb; std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl; @@ -260,20 +260,20 @@ int main(int argc, char** argv) { std::cout << GridLogMessage << "Test for the Two Index Symmetric projectors" << std::endl; // Projectors - SU3TwoIndexSymm::LatticeTwoIndexMatrix Gauss2(grid); + SUTwoIndexSymm::LatticeTwoIndexMatrix Gauss2(grid); random(gridRNG,Gauss2); std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl; - SU3TwoIndexSymm::projectOnAlgebra(ha, Gauss2); + SUTwoIndexSymm::projectOnAlgebra(ha, Gauss2); std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl; std::cout << GridLogMessage << "Start projector" << std::endl; - SU3TwoIndexSymm::projector(hb, Gauss2); + SUTwoIndexSymm::projector(hb, Gauss2); std::cout << GridLogMessage << "end projector" << std::endl; std::cout << GridLogMessage << "ReStart projector" << std::endl; - SU3TwoIndexSymm::projector(hb, Gauss2); + SUTwoIndexSymm::projector(hb, Gauss2); std::cout << GridLogMessage << "end projector" << std::endl; - SU3::LatticeAlgebraVector diff2 = ha - hb; + SU::LatticeAlgebraVector diff2 = ha - hb; std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl; std::cout << GridLogMessage << "*********************************************" << std::endl; @@ -284,20 +284,20 @@ int main(int argc, char** argv) { std::cout << 
GridLogMessage << "Test for the Two index anti-Symmetric projectors" << std::endl; // Projectors - SU3TwoIndexAntiSymm::LatticeTwoIndexMatrix Gauss2a(grid); + SUTwoIndexAntiSymm::LatticeTwoIndexMatrix Gauss2a(grid); random(gridRNG,Gauss2a); std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl; - SU3TwoIndexAntiSymm::projectOnAlgebra(ha, Gauss2a); + SUTwoIndexAntiSymm::projectOnAlgebra(ha, Gauss2a); std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl; std::cout << GridLogMessage << "Start projector" << std::endl; - SU3TwoIndexAntiSymm::projector(hb, Gauss2a); + SUTwoIndexAntiSymm::projector(hb, Gauss2a); std::cout << GridLogMessage << "end projector" << std::endl; std::cout << GridLogMessage << "ReStart projector" << std::endl; - SU3TwoIndexAntiSymm::projector(hb, Gauss2a); + SUTwoIndexAntiSymm::projector(hb, Gauss2a); std::cout << GridLogMessage << "end projector" << std::endl; - SU3::LatticeAlgebraVector diff2a = ha - hb; + SU::LatticeAlgebraVector diff2a = ha - hb; std::cout << GridLogMessage << "Difference: " << norm2(diff2a) << std::endl; std::cout << GridLogMessage << "*********************************************" << std::endl; diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc index af8b747b..d7ed04ba 100644 --- a/tests/core/Test_main.cc +++ b/tests/core/Test_main.cc @@ -444,7 +444,7 @@ int main(int argc, char **argv) { // Lattice 12x12 GEMM scFooBar = scFoo * scBar; - // Benchmark some simple operations LatticeSU3 * Lattice SU3. + // Benchmark some simple operations LatticeSU * Lattice SU. 
double t0, t1, flops; double bytes; int ncall = 5000; diff --git a/tests/core/Test_mobius_eofa_even_odd.cc b/tests/core/Test_mobius_eofa_even_odd.cc index 68091229..7339f156 100644 --- a/tests/core/Test_mobius_eofa_even_odd.cc +++ b/tests/core/Test_mobius_eofa_even_odd.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) LatticeFermion ref (FGrid); ref = Zero(); LatticeFermion tmp (FGrid); tmp = Zero(); LatticeFermion err (FGrid); err = Zero(); - LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu); + LatticeGaugeField Umu (UGrid); SU::HotConfiguration(RNG4, Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_quenched_update.cc b/tests/core/Test_quenched_update.cc index ef428d1b..22675913 100644 --- a/tests/core/Test_quenched_update.cc +++ b/tests/core/Test_quenched_update.cc @@ -55,7 +55,7 @@ int main (int argc, char ** argv) GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); - // SU3 colour operatoions + // SU colour operatoions LatticeColourMatrix link(grid); LatticeColourMatrix staple(grid); @@ -87,10 +87,10 @@ int main (int argc, char ** argv) link = PeekIndex(Umu,mu); - for( int subgroup=0;subgroup::su2subgroups();subgroup++ ) { // update Even checkerboard - SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); + SU::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); } diff --git a/tests/core/Test_staggered.cc b/tests/core/Test_staggered.cc index 1f42ff0d..51f92993 100644 --- a/tests/core/Test_staggered.cc +++ b/tests/core/Test_staggered.cc @@ -64,7 +64,7 @@ int main (int argc, char ** argv) FermionField err(&Grid); tmp=Zero(); FermionField phi (&Grid); random(pRNG,phi); FermionField chi (&Grid); random(pRNG,chi); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/core/Test_staggered5D.cc 
b/tests/core/Test_staggered5D.cc index 3d175890..6ab15873 100644 --- a/tests/core/Test_staggered5D.cc +++ b/tests/core/Test_staggered5D.cc @@ -75,7 +75,7 @@ int main (int argc, char ** argv) FermionField phi (FGrid); random(pRNG5,phi); FermionField chi (FGrid); random(pRNG5,chi); - LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::ColdConfiguration(pRNG4,Umu); LatticeGaugeField Umua(UGrid); Umua=Umu; double volume=Ls; diff --git a/tests/core/Test_staggered5Dvec.cc b/tests/core/Test_staggered5Dvec.cc index 73241276..ef8da662 100644 --- a/tests/core/Test_staggered5Dvec.cc +++ b/tests/core/Test_staggered5Dvec.cc @@ -84,7 +84,7 @@ int main (int argc, char ** argv) FermionField chi (FGrid); random(pRNG5,chi); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(pRNG4,Umu); + SU::HotConfiguration(pRNG4,Umu); /* for(int mu=1;mu<4;mu++){ diff --git a/tests/core/Test_staggered5DvecF.cc b/tests/core/Test_staggered5DvecF.cc index 2386d054..6893551c 100644 --- a/tests/core/Test_staggered5DvecF.cc +++ b/tests/core/Test_staggered5DvecF.cc @@ -83,7 +83,7 @@ int main (int argc, char ** argv) FermionField chi (FGrid); random(pRNG5,chi); LatticeGaugeFieldF Umu(UGrid); - SU3::HotConfiguration(pRNG4,Umu); + SU::HotConfiguration(pRNG4,Umu); /* for(int mu=1;mu<4;mu++){ diff --git a/tests/core/Test_staggered_naive.cc b/tests/core/Test_staggered_naive.cc index 9fe35a54..f41d723d 100644 --- a/tests/core/Test_staggered_naive.cc +++ b/tests/core/Test_staggered_naive.cc @@ -64,7 +64,7 @@ int main (int argc, char ** argv) FermionField err(&Grid); tmp=Zero(); FermionField phi (&Grid); random(pRNG,phi); FermionField chi (&Grid); random(pRNG,chi); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/core/Test_wilson_clover.cc b/tests/core/Test_wilson_clover.cc index 3e31f7f6..642c30a8 100644 --- 
a/tests/core/Test_wilson_clover.cc +++ b/tests/core/Test_wilson_clover.cc @@ -74,7 +74,7 @@ int main(int argc, char **argv) FermionField chi(&Grid); random(pRNG, chi); LatticeGaugeField Umu(&Grid); - SU3::HotConfiguration(pRNG, Umu); + SU::HotConfiguration(pRNG, Umu); std::vector U(4, &Grid); double volume = 1; diff --git a/tests/core/Test_wilson_even_odd.cc b/tests/core/Test_wilson_even_odd.cc index dc49cf81..e7733a79 100644 --- a/tests/core/Test_wilson_even_odd.cc +++ b/tests/core/Test_wilson_even_odd.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) LatticeFermion tmp(&Grid); tmp=Zero(); LatticeFermion err(&Grid); tmp=Zero(); LatticeGaugeField Umu(&Grid); - SU3::HotConfiguration(pRNG,Umu); + SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); double volume=1; diff --git a/tests/core/Test_wilson_twisted_mass_even_odd.cc b/tests/core/Test_wilson_twisted_mass_even_odd.cc index ba80fd0e..e0f73456 100644 --- a/tests/core/Test_wilson_twisted_mass_even_odd.cc +++ b/tests/core/Test_wilson_twisted_mass_even_odd.cc @@ -71,7 +71,7 @@ int main (int argc, char ** argv) LatticeFermion ref(&Grid); ref=Zero(); LatticeFermion tmp(&Grid); tmp=Zero(); LatticeFermion err(&Grid); tmp=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); double volume=1; diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc index 5a9c696f..5418a8af 100644 --- a/tests/debug/Test_cayley_cg.cc +++ b/tests/debug/Test_cayley_cg.cc @@ -116,7 +116,7 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); LatticeGaugeFieldF UmuF(UGridF); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); precisionChange(UmuF,Umu); std::vector U(4,UGrid); diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc index e91b3070..b2f691d7 100644 --- a/tests/debug/Test_cayley_coarsen_support.cc +++ 
b/tests/debug/Test_cayley_coarsen_support.cc @@ -77,7 +77,7 @@ int main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); LatticeFermion err(FGrid); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); #if 0 std::vector U(4,UGrid); diff --git a/tests/debug/Test_cayley_even_odd.cc b/tests/debug/Test_cayley_even_odd.cc index 433f0722..5e800b26 100644 --- a/tests/debug/Test_cayley_even_odd.cc +++ b/tests/debug/Test_cayley_even_odd.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); RealD mass=0.1; diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc index 82f388ab..416017e5 100644 --- a/tests/debug/Test_cayley_ldop_cr.cc +++ b/tests/debug/Test_cayley_ldop_cr.cc @@ -71,9 +71,9 @@ int main (int argc, char ** argv) std::string file("./ckpoint_lat.400"); NerscIO::readConfiguration(Umu,header,file); - // SU3::ColdConfiguration(RNG4,Umu); - // SU3::TepidConfiguration(RNG4,Umu); - // SU3::HotConfiguration(RNG4,Umu); + // SU::ColdConfiguration(RNG4,Umu); + // SU::TepidConfiguration(RNG4,Umu); + // SU::HotConfiguration(RNG4,Umu); // Umu=Zero(); RealD mass=0.1; diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 2ad605b8..2e56fa81 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -108,8 +108,8 @@ int main (int argc, char ** argv) GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU3::ColdConfiguration(Umu); - // SU3::HotConfiguration(RNG4,Umu); + SU::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); RealD mass=0.3; RealD M5 =1.0; diff 
--git a/tests/debug/Test_heatbath_dwf_eofa.cc b/tests/debug/Test_heatbath_dwf_eofa.cc index 1e64a568..9d453a96 100644 --- a/tests/debug/Test_heatbath_dwf_eofa.cc +++ b/tests/debug/Test_heatbath_dwf_eofa.cc @@ -73,7 +73,7 @@ int main(int argc, char** argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); DomainWallEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5); DomainWallEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5); diff --git a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc index cc118d1d..22cc1e90 100644 --- a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc @@ -77,7 +77,7 @@ int main(int argc, char** argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // GparityDomainWallFermionR::ImplParams params; FermionAction::ImplParams params; diff --git a/tests/debug/Test_heatbath_mobius_eofa.cc b/tests/debug/Test_heatbath_mobius_eofa.cc index 95ab935e..4cf4bf53 100644 --- a/tests/debug/Test_heatbath_mobius_eofa.cc +++ b/tests/debug/Test_heatbath_mobius_eofa.cc @@ -75,7 +75,7 @@ int main(int argc, char** argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); MobiusEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c); MobiusEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, b, c); diff --git a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc index 7ed3a308..2fcb4b9f 100644 --- a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc @@ -79,7 +79,7 @@ int main(int argc, char** argv) // Random gauge field 
LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); FermionAction::ImplParams params; FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c, params); diff --git a/tests/debug/Test_reweight_dwf_eofa.cc b/tests/debug/Test_reweight_dwf_eofa.cc index 728fbf78..a150b18f 100644 --- a/tests/debug/Test_reweight_dwf_eofa.cc +++ b/tests/debug/Test_reweight_dwf_eofa.cc @@ -102,7 +102,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators DomainWallFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5); diff --git a/tests/debug/Test_reweight_dwf_eofa_gparity.cc b/tests/debug/Test_reweight_dwf_eofa_gparity.cc index fcc01b8d..df2d95a0 100644 --- a/tests/debug/Test_reweight_dwf_eofa_gparity.cc +++ b/tests/debug/Test_reweight_dwf_eofa_gparity.cc @@ -104,7 +104,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators GparityDomainWallFermionR::ImplParams params; diff --git a/tests/debug/Test_reweight_mobius_eofa.cc b/tests/debug/Test_reweight_mobius_eofa.cc index c5e46bcf..88ecab7d 100644 --- a/tests/debug/Test_reweight_mobius_eofa.cc +++ b/tests/debug/Test_reweight_mobius_eofa.cc @@ -104,7 +104,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators MobiusFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c); diff --git a/tests/debug/Test_reweight_mobius_eofa_gparity.cc b/tests/debug/Test_reweight_mobius_eofa_gparity.cc index bfc7543a..31708265 100644 --- a/tests/debug/Test_reweight_mobius_eofa_gparity.cc +++ 
b/tests/debug/Test_reweight_mobius_eofa_gparity.cc @@ -106,7 +106,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators GparityDomainWallFermionR::ImplParams params; diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc index cb30faad..dc9eedce 100644 --- a/tests/forces/Test_contfrac_force.cc +++ b/tests/forces/Test_contfrac_force.cc @@ -59,7 +59,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -93,7 +93,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_dwf_force.cc b/tests/forces/Test_dwf_force.cc index 81a1b8c4..e7d17347 100644 --- a/tests/forces/Test_dwf_force.cc +++ b/tests/forces/Test_dwf_force.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -94,7 +94,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc index 0b0ba346..80d36934 100644 --- a/tests/forces/Test_dwf_force_eofa.cc +++ b/tests/forces/Test_dwf_force_eofa.cc @@ -72,7 +72,7 @@ int main (int argc, char** argv) LatticeFermion MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -105,7 +105,7 @@ int main (int 
argc, char** argv) for(int mu=0; mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc index b39fdd14..28133cc6 100644 --- a/tests/forces/Test_dwf_gpforce.cc +++ b/tests/forces/Test_dwf_gpforce.cc @@ -63,8 +63,8 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); - // SU3::ColdConfiguration(pRNG,U); + SU::HotConfiguration(RNG4,U); + // SU::ColdConfiguration(pRNG,U); //////////////////////////////////// // Unmodified matrix element @@ -112,7 +112,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg Hmom -= real(sum(trace(mommu*mommu))); diff --git a/tests/forces/Test_dwf_gpforce_eofa.cc b/tests/forces/Test_dwf_gpforce_eofa.cc index 58258a5e..7e480e7a 100644 --- a/tests/forces/Test_dwf_gpforce_eofa.cc +++ b/tests/forces/Test_dwf_gpforce_eofa.cc @@ -75,7 +75,7 @@ int main (int argc, char** argv) FermionField MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -109,7 +109,7 @@ int main (int argc, char** argv) for(int mu=0; mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); diff --git a/tests/forces/Test_gp_plaq_force.cc b/tests/forces/Test_gp_plaq_force.cc index 21f0b9d0..bc2b5b26 100644 --- a/tests/forces/Test_gp_plaq_force.cc +++ b/tests/forces/Test_gp_plaq_force.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG,U); + SU::HotConfiguration(pRNG,U); double beta = 1.0; ConjugateWilsonGaugeActionR Action(beta); @@ -80,7 +80,7 @@ int main (int argc, char ** argv) for(int 
mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc index bb4ea6de..98ebb2fa 100644 --- a/tests/forces/Test_gp_rect_force.cc +++ b/tests/forces/Test_gp_rect_force.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG,U); + SU::HotConfiguration(pRNG,U); double beta = 1.0; double c1 = 0.331; @@ -82,7 +82,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc index bdc332d9..d6744080 100644 --- a/tests/forces/Test_gpdwf_force.cc +++ b/tests/forces/Test_gpdwf_force.cc @@ -63,7 +63,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -100,7 +100,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc index 1c85a5d9..d731f27a 100644 --- a/tests/forces/Test_gpwilson_force.cc +++ b/tests/forces/Test_gpwilson_force.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -94,7 +94,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_laplacian_force.cc b/tests/forces/Test_laplacian_force.cc index 
639378dc..18508860 100644 --- a/tests/forces/Test_laplacian_force.cc +++ b/tests/forces/Test_laplacian_force.cc @@ -58,7 +58,7 @@ int main (int argc, char ** argv) PokeIndex(P, P_mu, mu); } - SU3::HotConfiguration(pRNG,U); + SU::HotConfiguration(pRNG,U); ConjugateGradient CG(1.0e-8, 10000); @@ -95,7 +95,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Update the U " << std::endl; for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); auto Umu = PeekIndex(U, mu); PokeIndex(mom,mommu,mu); Umu = expMat(mommu, dt, 12) * Umu; diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc index 11e69652..ba7bc363 100644 --- a/tests/forces/Test_mobius_force.cc +++ b/tests/forces/Test_mobius_force.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -96,7 +96,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_mobius_force_eofa.cc b/tests/forces/Test_mobius_force_eofa.cc index f85501fa..28523e9c 100644 --- a/tests/forces/Test_mobius_force_eofa.cc +++ b/tests/forces/Test_mobius_force_eofa.cc @@ -72,7 +72,7 @@ int main (int argc, char** argv) LatticeFermion MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -107,7 +107,7 @@ int main (int argc, char** argv) for(int mu=0; mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc index 68163e63..9c80b2aa 100644 --- 
a/tests/forces/Test_mobius_gpforce_eofa.cc +++ b/tests/forces/Test_mobius_gpforce_eofa.cc @@ -76,7 +76,7 @@ int main (int argc, char** argv) FermionField MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -112,7 +112,7 @@ int main (int argc, char** argv) for(int mu=0; mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); autoView( U_v , U, CpuRead); diff --git a/tests/forces/Test_partfrac_force.cc b/tests/forces/Test_partfrac_force.cc index 17dce530..33f7b5fd 100644 --- a/tests/forces/Test_partfrac_force.cc +++ b/tests/forces/Test_partfrac_force.cc @@ -62,7 +62,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -96,7 +96,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc index ed72f2c0..c9326f8d 100644 --- a/tests/forces/Test_rect_force.cc +++ b/tests/forces/Test_rect_force.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG,U); + SU::HotConfiguration(pRNG,U); double beta = 1.0; double c1 = -0.331; @@ -82,7 +82,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc index c8b3a7f4..b7bf1268 100644 --- a/tests/forces/Test_wilson_force.cc +++ b/tests/forces/Test_wilson_force.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) 
LatticeGaugeField U(&Grid); //SU2::HotConfiguration(pRNG,U); - SU3::ColdConfiguration(pRNG,U); + SU::ColdConfiguration(pRNG,U); //////////////////////////////////// // Unmodified matrix element @@ -98,7 +98,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); Hmom -= real(sum(trace(mommu*mommu))); diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc index f26f0ac9..6a28e4e2 100644 --- a/tests/forces/Test_wilsonclover_force.cc +++ b/tests/forces/Test_wilsonclover_force.cc @@ -62,8 +62,8 @@ int main(int argc, char **argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG, U); - //SU3::ColdConfiguration(pRNG, U);// Clover term Zero() + SU::HotConfiguration(pRNG, U); + //SU::ColdConfiguration(pRNG, U);// Clover term Zero() //////////////////////////////////// // Unmodified matrix element @@ -101,7 +101,7 @@ int main(int argc, char **argv) for (int mu = 0; mu < Nd; mu++) { // Traceless antihermitian momentum; gaussian in lie alg - SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); + SU::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); Hmom -= real(sum(trace(mommu * mommu))); PokeIndex(mom, mommu, mu); diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc index e24ae601..89673bc7 100644 --- a/tests/forces/Test_zmobius_force.cc +++ b/tests/forces/Test_zmobius_force.cc @@ -59,7 +59,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -109,7 +109,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc index d9249e0d..3766e069 
100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc @@ -293,7 +293,7 @@ int main (int argc, char ** argv) { { std::vector seeds4({1,2,3,4}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); } std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; diff --git a/tests/lanczos/Test_dwf_lanczos.cc b/tests/lanczos/Test_dwf_lanczos.cc index 12283921..00d29ec0 100644 --- a/tests/lanczos/Test_dwf_lanczos.cc +++ b/tests/lanczos/Test_dwf_lanczos.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4, Umu); /* std::vector U(4, UGrid); diff --git a/tests/qdpxx/Test_qdpxx_baryon.cc b/tests/qdpxx/Test_qdpxx_baryon.cc index a1d8f738..d8225f82 100644 --- a/tests/qdpxx/Test_qdpxx_baryon.cc +++ b/tests/qdpxx/Test_qdpxx_baryon.cc @@ -280,7 +280,7 @@ void make_gauge(GaugeField &Umu, Grid::LatticePropagator &q1,Grid::LatticePropag Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu.Grid(); Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4, Umu); + Grid::SU::HotConfiguration(RNG4, Umu); // Propagator Grid::gaussian(RNG4, q1); diff --git a/tests/qdpxx/Test_qdpxx_loops_staples.cc b/tests/qdpxx/Test_qdpxx_loops_staples.cc index bbb41f4e..33057eeb 100644 --- a/tests/qdpxx/Test_qdpxx_loops_staples.cc +++ b/tests/qdpxx/Test_qdpxx_loops_staples.cc @@ -277,7 +277,7 @@ double calc_grid_p(Grid::LatticeGaugeField & Umu) Grid::GridCartesian * UGrid = (Grid::GridCartesian *) Umu.Grid(); Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4,Umu); + 
Grid::SU::HotConfiguration(RNG4,Umu); Grid::LatticeColourMatrix tmp(UGrid); tmp = Grid::zero; diff --git a/tests/qdpxx/Test_qdpxx_munprec.cc b/tests/qdpxx/Test_qdpxx_munprec.cc index fbc1ec82..82874546 100644 --- a/tests/qdpxx/Test_qdpxx_munprec.cc +++ b/tests/qdpxx/Test_qdpxx_munprec.cc @@ -502,7 +502,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF Grid::gaussian(RNG5,src); Grid::gaussian(RNG5,res); - Grid::SU3::HotConfiguration(RNG4,Umu); + Grid::SU::HotConfiguration(RNG4,Umu); /* Grid::LatticeColourMatrix U(UGrid); diff --git a/tests/qdpxx/Test_qdpxx_stag.cc b/tests/qdpxx/Test_qdpxx_stag.cc index f283d5a9..8f81fa99 100644 --- a/tests/qdpxx/Test_qdpxx_stag.cc +++ b/tests/qdpxx/Test_qdpxx_stag.cc @@ -333,7 +333,7 @@ void make_gauge(GaugeField & Umu,FermionField &src) Grid::GridCartesian * UGrid = (Grid::GridCartesian *) Umu.Grid(); Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4,Umu); + Grid::SU::HotConfiguration(RNG4,Umu); Grid::gaussian(RNG4,src); } diff --git a/tests/qdpxx/Test_qdpxx_wilson.cc b/tests/qdpxx/Test_qdpxx_wilson.cc index fdf59982..8ce28dca 100644 --- a/tests/qdpxx/Test_qdpxx_wilson.cc +++ b/tests/qdpxx/Test_qdpxx_wilson.cc @@ -348,7 +348,7 @@ void make_gauge(GaugeField &Umu, FermionField &src) Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu._grid; Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4, Umu); + Grid::SU::HotConfiguration(RNG4, Umu); // Fermion field Grid::gaussian(RNG4, src); diff --git a/tests/smearing/Test_smearing.cc b/tests/smearing/Test_smearing.cc index c1c7c457..adab1c6e 100644 --- a/tests/smearing/Test_smearing.cc +++ b/tests/smearing/Test_smearing.cc @@ -47,8 +47,8 @@ int main (int argc, char ** argv) RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); LatticeGaugeField Umu(&Grid); - // SU3::HotConfiguration(pRNG,Umu); - SU3::ColdConfiguration(Umu); + // 
SU::HotConfiguration(pRNG,Umu); + SU::ColdConfiguration(Umu); std::vector U(4,&Grid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); RealD mass=0.1; diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index cb53894f..debb736a 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -67,7 +67,7 @@ int main(int argc, char** argv) { result = Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; diff --git a/tests/solver/Test_dwf_cg_schur.cc b/tests/solver/Test_dwf_cg_schur.cc index 6216c366..6541e73d 100644 --- a/tests/solver/Test_dwf_cg_schur.cc +++ b/tests/solver/Test_dwf_cg_schur.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) LatticeFermion src(FGrid); random(RNG5,src); LatticeFermion result(FGrid); result=Zero(); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); diff --git a/tests/solver/Test_dwf_fpgcr.cc b/tests/solver/Test_dwf_fpgcr.cc index 156f678a..42cc8de1 100644 --- a/tests/solver/Test_dwf_fpgcr.cc +++ b/tests/solver/Test_dwf_fpgcr.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) LatticeFermion result(FGrid); result=Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); ConjugateResidual CR(1.0e-6,10000); diff --git a/tests/solver/Test_dwf_mrhs_cg.cc b/tests/solver/Test_dwf_mrhs_cg.cc index 982a8247..b912ba4f 100644 --- a/tests/solver/Test_dwf_mrhs_cg.cc +++ b/tests/solver/Test_dwf_mrhs_cg.cc @@ -93,7 +93,7 @@ int main (int argc, char ** argv) 
for(int s=0;s::HotConfiguration(pRNG,Umu); /////////////////////////////////////////////////////////////// // Bounce these fields to disk diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index 8ace9b43..d0a32460 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -136,11 +136,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Intialising 4D RNG "<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<::HotConfiguration(pRNG,Umu); ///////////////// // MPI only sends diff --git a/tests/solver/Test_dwf_qmr_unprec.cc b/tests/solver/Test_dwf_qmr_unprec.cc index ba44ee93..370e7409 100644 --- a/tests/solver/Test_dwf_qmr_unprec.cc +++ b/tests/solver/Test_dwf_qmr_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) LatticeFermion src(FGrid); random(RNG5,src); LatticeFermion result(FGrid); result=Zero(); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); diff --git a/tests/solver/Test_mobius_bcg.cc b/tests/solver/Test_mobius_bcg.cc index 8b34a6a5..8092d61c 100644 --- a/tests/solver/Test_mobius_bcg.cc +++ b/tests/solver/Test_mobius_bcg.cc @@ -128,7 +128,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Intialising 4D RNG "<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<::HotConfiguration(pRNG,Umu); 
std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<::HotConfiguration(pRNG,Umu); ///////////////// // MPI only sends diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc index 2499fc8a..c5306e85 100644 --- a/tests/solver/Test_staggered_block_cg_prec.cc +++ b/tests/solver/Test_staggered_block_cg_prec.cc @@ -87,7 +87,7 @@ int main (int argc, char ** argv) FermionField result_o(FrbGrid); result_o=Zero(); RealD nrm = norm2(src); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); RealD mass=0.003; RealD c1=9.0/8.0; diff --git a/tests/solver/Test_staggered_cagmres_unprec.cc b/tests/solver/Test_staggered_cagmres_unprec.cc index 8121c90d..1b7a2f56 100644 --- a/tests/solver/Test_staggered_cagmres_unprec.cc +++ b/tests/solver/Test_staggered_cagmres_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); FermionField src(&Grid); random(pRNG,src); FermionField result(&Grid); result=Zero(); diff --git a/tests/solver/Test_staggered_cg_unprec.cc b/tests/solver/Test_staggered_cg_unprec.cc index 9625a9c8..e023b910 100644 --- a/tests/solver/Test_staggered_cg_unprec.cc +++ b/tests/solver/Test_staggered_cg_unprec.cc @@ -65,7 +65,7 @@ int main (int argc, char 
** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/solver/Test_wilson_cg_schur.cc b/tests/solver/Test_wilson_cg_schur.cc index 23383032..97482131 100644 --- a/tests/solver/Test_wilson_cg_schur.cc +++ b/tests/solver/Test_wilson_cg_schur.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); LatticeFermion src(&Grid); random(pRNG,src); LatticeFermion result(&Grid); result=Zero(); diff --git a/tests/solver/Test_wilson_cg_unprec.cc b/tests/solver/Test_wilson_cg_unprec.cc index f3335d45..07f6ba7b 100644 --- a/tests/solver/Test_wilson_cg_unprec.cc +++ b/tests/solver/Test_wilson_cg_unprec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/solver/Test_wilson_fcagmres_prec.cc b/tests/solver/Test_wilson_fcagmres_prec.cc index b821a25f..d2a1acf4 100644 
--- a/tests/solver/Test_wilson_fcagmres_prec.cc +++ b/tests/solver/Test_wilson_fcagmres_prec.cc @@ -47,7 +47,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(fPRNG, Umu); // clang-format on RealD mass = -0.25; diff --git a/tests/solver/Test_wilson_mg_mp.cc b/tests/solver/Test_wilson_mg_mp.cc index e631cd15..89bbbf74 100644 --- a/tests/solver/Test_wilson_mg_mp.cc +++ b/tests/solver/Test_wilson_mg_mp.cc @@ -52,7 +52,7 @@ int main(int argc, char **argv) { LatticeFermionD src_d(FGrid_d); gaussian(fPRNG, src_d); LatticeFermionD resultMGD_d(FGrid_d); resultMGD_d = Zero(); LatticeFermionD resultMGF_d(FGrid_d); resultMGF_d = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilson_mr_unprec.cc b/tests/solver/Test_wilson_mr_unprec.cc index 1cc1f418..fef83794 100644 --- a/tests/solver/Test_wilson_mr_unprec.cc +++ b/tests/solver/Test_wilson_mr_unprec.cc @@ -47,7 +47,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,Grid); diff --git a/tests/solver/Test_wilsonclover_bicgstab_prec.cc b/tests/solver/Test_wilsonclover_bicgstab_prec.cc index c1905400..b382b1bb 
100644 --- a/tests/solver/Test_wilsonclover_bicgstab_prec.cc +++ b/tests/solver/Test_wilsonclover_bicgstab_prec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/solver/Test_wilsonclover_cg_schur.cc b/tests/solver/Test_wilsonclover_cg_schur.cc index eaae24b3..567a8283 100644 --- a/tests/solver/Test_wilsonclover_cg_schur.cc +++ b/tests/solver/Test_wilsonclover_cg_schur.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); LatticeFermion src(&Grid); random(pRNG,src); LatticeFermion result(&Grid); result=Zero(); diff --git a/tests/solver/Test_wilsonclover_cg_unprec.cc b/tests/solver/Test_wilsonclover_cg_unprec.cc index 49c52cdf..755d80e1 100644 --- a/tests/solver/Test_wilsonclover_cg_unprec.cc +++ b/tests/solver/Test_wilsonclover_cg_unprec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double 
volume=1; for(int mu=0;mu::HotConfiguration(fPRNG, Umu); // clang-format on RealD mass = -0.25; diff --git a/tests/solver/Test_wilsonclover_mg_lime.cc b/tests/solver/Test_wilsonclover_mg_lime.cc index bd2990d4..0a29c034 100644 --- a/tests/solver/Test_wilsonclover_mg_lime.cc +++ b/tests/solver/Test_wilsonclover_mg_lime.cc @@ -75,7 +75,7 @@ int main(int argc, char **argv) { NerscIO::readConfiguration(Umu_d,header,file); } #endif - // SU3::HotConfiguration(fPRNG, Umu_d); + // SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mg_mp.cc b/tests/solver/Test_wilsonclover_mg_mp.cc index b5178d2e..2efe5f08 100644 --- a/tests/solver/Test_wilsonclover_mg_mp.cc +++ b/tests/solver/Test_wilsonclover_mg_mp.cc @@ -52,7 +52,7 @@ int main(int argc, char **argv) { LatticeFermionD src_d(FGrid_d); gaussian(fPRNG, src_d); LatticeFermionD resultMGD_d(FGrid_d); resultMGD_d = zero; LatticeFermionD resultMGF_d(FGrid_d); resultMGF_d = zero; - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc b/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc index 0af83f8b..d47dac2a 100644 --- a/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc +++ b/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) // clang-format off LatticeFermionD src(FGrid_d); gaussian(fPRNG, src); LatticeFermionD result(FGrid_d); result = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git 
a/tests/solver/Test_wilsonclover_mixedcg_prec.cc b/tests/solver/Test_wilsonclover_mixedcg_prec.cc index 8af9036f..95590004 100644 --- a/tests/solver/Test_wilsonclover_mixedcg_prec.cc +++ b/tests/solver/Test_wilsonclover_mixedcg_prec.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) // clang-format off LatticeFermionD src(FGrid_d); gaussian(fPRNG, src); LatticeFermionD result(FGrid_d); result = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mr_unprec.cc b/tests/solver/Test_wilsonclover_mr_unprec.cc index c7b5ecfe..ab49ec1f 100644 --- a/tests/solver/Test_wilsonclover_mr_unprec.cc +++ b/tests/solver/Test_wilsonclover_mr_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(RNG4, Umu); } std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() diff --git a/tests/solver/Test_zmobius_cg_prec.cc b/tests/solver/Test_zmobius_cg_prec.cc index fb57cff1..6b007afc 100644 --- a/tests/solver/Test_zmobius_cg_prec.cc +++ b/tests/solver/Test_zmobius_cg_prec.cc @@ -67,7 +67,7 @@ int main(int argc, char** argv) { result = Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; From 6b1486e89b8d97470e8b50657e158618a67fb392 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Thu, 8 Oct 2020 16:31:24 +0100 Subject: [PATCH 020/201] fixing number of colours defaulting to 4 in most 
cases --- configure.ac | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index c1dc09d4..cee2a84c 100644 --- a/configure.ac +++ b/configure.ac @@ -130,9 +130,9 @@ AC_ARG_ENABLE([Nc], case ${ac_Nc} in 2) - AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);; + AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);; 3) - AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);; + AC_DEFINE([Config_Nc],[3],[Gauge group Nc]);; 4) AC_DEFINE([Config_Nc],[4],[Gauge group Nc]);; 5) From 5f0fe029d213bbab483cb3d48076ecce22790751 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 8 Oct 2020 19:51:28 -0400 Subject: [PATCH 021/201] Improve meemory benchmarks for GPU (avoid host mem ping pong) --- benchmarks/Benchmark_ITT.cc | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index df5427c1..0ddf5068 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -125,7 +125,7 @@ public: lat*mpi_layout[1], lat*mpi_layout[2], lat*mpi_layout[3]}); - std::cout << GridLogMessage<< latt_size <({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=8){ @@ -249,11 +249,6 @@ public: double start=usecond(); for(int i=0;i({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=8){ @@ -309,11 +304,6 @@ public: double start=usecond(); for(int i=0;i Date: Thu, 8 Oct 2020 19:52:08 -0400 Subject: [PATCH 022/201] Single prec benchmark in double prec compile --- benchmarks/Benchmark_dwf_fp32.cc | 364 +++++++++++++++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 benchmarks/Benchmark_dwf_fp32.cc diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc new file mode 100644 index 00000000..cb86177e --- /dev/null +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -0,0 +1,364 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: 
./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + Coordinate latt4 = GridDefaultLatt(); + int Ls=8; + for(int i=0;i> Ls; + } + + GridLogLayout(); + + long unsigned int single_site_flops = 8*Nc*(7+16*Nc); + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << 
"Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; +#if 0 + Umu=1.0; + for(int mu=0;mu(Umu,mu); + // if (mu !=2 ) ttmp = 0; + // ttmp = ttmp* pow(10.0,mu); + PokeIndex(Umu,ttmp,mu); + } + std::cout << GridLogMessage << "Forced to diagonal " << std::endl; +#endif + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + // replicate across fifth dimension + LatticeGaugeFieldF Umu5d(FGrid); + std::vector U(4,FGrid); + { + autoView( Umu5d_v, Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); + for(int ss=0;ssoSites();ss++){ + for(int s=0;s(Umu5d,mu); + } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;mu_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.ZeroCounters(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + /* + std::cout << "RESULT\n " << result<Barrier(); + exit(-1); + } + assert (norm2(err)< 1.0e-4 ); + Dw.Report(); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;mu1.0e-4)){ +/* + std::cout<< "DAG 
RESULT\n " <Barrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* + std::cout<< "Deo RESULT\n " < Date: Thu, 8 Oct 2020 22:19:20 -0400 Subject: [PATCH 023/201] more runtime --- benchmarks/Benchmark_ITT.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 0ddf5068..eb275728 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -422,7 +422,7 @@ public: } FGrid->Barrier(); double t1=usecond(); - uint64_t ncall = 50; + uint64_t ncall = 500; FGrid->Broadcast(0,&ncall,sizeof(ncall)); From b24a504d7c8f201c7689f4d05ece21e20eced345 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Fri, 9 Oct 2020 20:28:54 +0100 Subject: [PATCH 024/201] hook to access last parallel I/O performance measurement --- Grid/parallelIO/BinaryIO.cc | 3 ++- Grid/parallelIO/BinaryIO.h | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Grid/parallelIO/BinaryIO.cc b/Grid/parallelIO/BinaryIO.cc index 221a7fe8..ef1b6683 100644 --- a/Grid/parallelIO/BinaryIO.cc +++ b/Grid/parallelIO/BinaryIO.cc @@ -1,3 +1,4 @@ #include -int Grid::BinaryIO::latticeWriteMaxRetry = -1; +int Grid::BinaryIO::latticeWriteMaxRetry = -1; +Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf; diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h index 1f11add9..e390b575 100644 --- a/Grid/parallelIO/BinaryIO.h +++ b/Grid/parallelIO/BinaryIO.h @@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key) /////////////////////////////////////////////////////////////////////////////////////////////////// class BinaryIO { public: + struct IoPerf + { + uint64_t size{0},time{0}; + double mbytesPerSecond{0.}; + }; + + static IoPerf lastPerf; static int latticeWriteMaxRetry; ///////////////////////////////////////////////////////////////////////////// @@ -502,12 +509,15 @@ class BinaryIO { 
timer.Stop(); } + lastPerf.size = sizeof(fobj)*iodata.size()*nrank; + lastPerf.time = timer.useconds(); + lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6); std::cout< Date: Fri, 9 Oct 2020 20:29:40 +0100 Subject: [PATCH 025/201] multi-pass I/O benchmark, with statistic and robustness summary --- benchmarks/Benchmark_IO.cc | 202 ++++++++++++++++++++++++++++-------- benchmarks/Benchmark_IO.hpp | 54 +++++++--- 2 files changed, 198 insertions(+), 58 deletions(-) diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index 5e4cef9f..0393257d 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -1,8 +1,16 @@ #include "Benchmark_IO.hpp" +#ifndef BENCH_IO_LMIN +#define BENCH_IO_LMIN 20 +#endif + #ifndef BENCH_IO_LMAX -#define BENCH_IO_LMAX 40 +#define BENCH_IO_LMAX 30 +#endif + +#ifndef BENCH_IO_NPASS +#define BENCH_IO_NPASS 10 #endif using namespace Grid; @@ -12,62 +20,174 @@ std::string filestem(const int l) return "iobench_l" + std::to_string(l); } +int vol(const int i) +{ + return BENCH_IO_LMIN + 2*i; +} + +int volInd(const int l) +{ + return (l - BENCH_IO_LMIN)/2; +} + +template +void stats(Mat &mean, Mat &stdDev, const std::vector &data) +{ + auto nr = data[0].rows(), nc = data[0].cols(); + Eigen::MatrixXd sqSum(nr, nc); + double n = static_cast(data.size()); + + assert(n > 1.); + mean = Mat::Zero(nr, nc); + sqSum = Mat::Zero(nr, nc); + for (auto &d: data) + { + mean += d; + sqSum += d.cwiseProduct(d); + } + stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseSqrt(); + mean /= n; +} + +#define grid_printf(...) 
\ +MSG << "";\ +printf(__VA_ARGS__); + +enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3}; + int main (int argc, char ** argv) { Grid_init(&argc,&argv); - int64_t threads = GridThread::GetThreads(); - auto mpi = GridDefaultMpi(); - std::vector latt; + int64_t threads = GridThread::GetThreads(); + auto mpi = GridDefaultMpi(); + unsigned int nVol = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1; + unsigned int nRelVol = (BENCH_IO_LMAX - 24)/2 + 1; + std::vector perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4)); + std::vector avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4)); + std::vector latt; MSG << "Grid is setup to use " << threads << " threads" << std::endl; MSG << "MPI partition " << mpi << std::endl; - - MSG << SEP << std::endl; - MSG << "Benchmark std write" << std::endl; - MSG << SEP << std::endl; - for (int l = 4; l <= BENCH_IO_LMAX; l += 2) + for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i) { - latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + MSG << BIGSEP << std::endl; + MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl; + MSG << BIGSEP << std::endl; + MSG << SEP << std::endl; + MSG << "Benchmark std write" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; - MSG << "-- Local volume " << l << "^4" << std::endl; - writeBenchmark(latt, filestem(l), stdWrite); - } + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), stdWrite); + perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond; + } - MSG << SEP << std::endl; - MSG << "Benchmark std read" << std::endl; - MSG << SEP << std::endl; - for (int l = 4; l <= BENCH_IO_LMAX; l += 2) - { - latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + MSG << SEP << std::endl; + MSG << "Benchmark std read" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; - MSG << 
"-- Local volume " << l << "^4" << std::endl; - readBenchmark(latt, filestem(l), stdRead); - } + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), stdRead); + perf[i](volInd(l), sRead) = BinaryIO::lastPerf.mbytesPerSecond; + } -#ifdef HAVE_LIME - MSG << SEP << std::endl; - MSG << "Benchmark Grid C-Lime write" << std::endl; - MSG << SEP << std::endl; - for (int l = 4; l <= BENCH_IO_LMAX; l += 2) - { - latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + #ifdef HAVE_LIME + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime write" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; - MSG << "-- Local volume " << l << "^4" << std::endl; - writeBenchmark(latt, filestem(l), limeWrite); - } + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), limeWrite); + perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond; + } - MSG << SEP << std::endl; - MSG << "Benchmark Grid C-Lime read" << std::endl; - MSG << SEP << std::endl; - for (int l = 4; l <= BENCH_IO_LMAX; l += 2) - { - latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime read" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; - MSG << "-- Local volume " << l << "^4" << std::endl; - readBenchmark(latt, filestem(l), limeRead); - } + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), limeRead); + perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond; + } #endif + avPerf[i].fill(0.); + for (int f = 0; f < 4; ++f) + for (int l = 24; l <= BENCH_IO_LMAX; l += 2) + { + avPerf[i](f) += perf[i](volInd(l), f); + } + avPerf[i] /= nRelVol; + } + + Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4); + Eigen::VectorXd avMean(4), 
avStdDev(4), avRob(4); + double n = BENCH_IO_NPASS; + + stats(mean, stdDev, perf); + stats(avMean, avStdDev, avPerf); + rob.fill(100.); + rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs()); + avRob.fill(100.); + avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs()); + + MSG << BIGSEP << std::endl; + MSG << "SUMMARY" << std::endl; + MSG << BIGSEP << std::endl; + MSG << "Summary of individual results (all results in MB/s)." << std::endl; + MSG << "Every second colum gives the standard deviation of the previous column." << std::endl; + MSG << std::endl; + grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", + "L", "std read", "std dev", "std write", "std dev", + "Grid read", "std dev", "Grid write", "std dev"); + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", + l, mean(volInd(l), sRead), stdDev(volInd(l), sRead), + mean(volInd(l), sWrite), stdDev(volInd(l), sWrite), + mean(volInd(l), gRead), stdDev(volInd(l), gRead), + mean(volInd(l), gWrite), stdDev(volInd(l), gWrite)); + } + MSG << std::endl; + MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl; + MSG << std::endl; + grid_printf("%4s %12s %12s %12s %12s\n", + "L", "std read", "std write", "Grid read", "Grid write"); + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", + l, rob(volInd(l), sRead), rob(volInd(l), sWrite), + rob(volInd(l), gRead), rob(volInd(l), gWrite)); + } + MSG << std::endl; + MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl; + MSG << "Every second colum gives the standard deviation of the previous column." 
<< std::endl; + MSG << std::endl; + grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", + "std read", "std dev", "std write", "std dev", + "Grid read", "std dev", "Grid write", "std dev"); + grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", + avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), + avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite)); + MSG << std::endl; + MSG << "Robustness of volume-averaged results, in \%. (rob = 100\% - std dev / mean)" << std::endl; + MSG << std::endl; + grid_printf("%12s %12s %12s %12s\n", + "std read", "std write", "Grid read", "Grid write"); + grid_printf("%12.1f %12.1f %12.1f %12.1f\n", + avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite)); Grid_finalize(); diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp index 39af14ba..c4a6ca58 100644 --- a/benchmarks/Benchmark_IO.hpp +++ b/benchmarks/Benchmark_IO.hpp @@ -5,6 +5,8 @@ #ifdef HAVE_LIME #define MSG std::cout << GridLogMessage #define SEP \ +"-----------------------------------------------------------------------------" +#define BIGSEP \ "=============================================================================" namespace Grid { @@ -37,9 +39,12 @@ using ReaderFn = std::function; // ioWatch.Stop(); // std::fclose(file); // size *= vec.Grid()->ProcessorCount(); -// MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() -// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) -// << " MB/s" << std::endl; +// auto &p = BinaryIO::lastPerf; +// p.size = size; +// p.time = ioWatch.useconds(); +// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); +// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() +// << ", " << p.mbytesPerSecond << " MB/s" << std::endl; // MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; // } // @@ -72,9 +77,12 @@ using ReaderFn = std::function; // MSG << "Std I/O read: Data 
CRC32 " << std::hex << crcData << std::dec << std::endl; // assert(crcData == crcRead); // size *= vec.Grid()->ProcessorCount(); -// MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() -// << ", performance " << size/1024./1024./(ioWatch.useconds()/1.e6) -// << " MB/s" << std::endl; +// auto &p = BinaryIO::lastPerf; +// p.size = size; +// p.time = ioWatch.useconds(); +// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); +// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() +// << ", " << p.mbytesPerSecond << " MB/s" << std::endl; // MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; // } @@ -100,9 +108,12 @@ void stdWrite(const std::string filestem, Field &vec) file.flush(); ioWatch.Stop(); size *= vec.Grid()->ProcessorCount(); - MSG << "Std I/O write: Wrote " << size << " bytes in " << ioWatch.Elapsed() - << ", " << size/1024./1024./(ioWatch.useconds()/1.e6) - << " MB/s" << std::endl; + auto &p = BinaryIO::lastPerf; + p.size = size; + p.time = ioWatch.useconds(); + p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() + << ", " << p.mbytesPerSecond << " MB/s" << std::endl; MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; } @@ -135,9 +146,12 @@ void stdRead(Field &vec, const std::string filestem) MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; assert(crcData == crcRead); size *= vec.Grid()->ProcessorCount(); - MSG << "Std I/O read: Read " << size << " bytes in " << ioWatch.Elapsed() - << ", " << size/1024./1024./(ioWatch.useconds()/1.e6) - << " MB/s" << std::endl; + auto &p = BinaryIO::lastPerf; + p.size = size; + p.time = ioWatch.useconds(); + p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() + << ", " << p.mbytesPerSecond << " MB/s" 
<< std::endl; MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; } @@ -200,12 +214,18 @@ void writeBenchmark(const Coordinate &latt, const std::string filename, auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr gPt; + std::random_device rd; makeGrid(gPt, gBasePt, Ls, rb); - GridBase *g = gPt.get(); - GridParallelRNG rng(g); - Field vec(g); + GridBase *g = gPt.get(); + GridParallelRNG rng(g); + Field vec(g); + + rng.SeedFixedIntegers({static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd())}); random(rng, vec); write(filename, vec); @@ -223,8 +243,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename, makeGrid(gPt, gBasePt, Ls, rb); - GridBase *g = gPt.get(); - Field vec(g); + GridBase *g = gPt.get(); + Field vec(g); read(vec, filename); } From 0e17bd6597cb6e62af58394e4eac726910caa477 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Fri, 9 Oct 2020 20:29:57 +0100 Subject: [PATCH 026/201] I/O benchmark cleanup --- benchmarks/Benchmark_IO_vs_dir.cc | 4 -- benchmarks/benchmark-io-csv.sh | 76 ------------------------------- 2 files changed, 80 deletions(-) delete mode 100755 benchmarks/benchmark-io-csv.sh diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc index 9ccfd554..e030bc39 100644 --- a/benchmarks/Benchmark_IO_vs_dir.cc +++ b/benchmarks/Benchmark_IO_vs_dir.cc @@ -1,9 +1,5 @@ #include "Benchmark_IO.hpp" -#define MSG std::cout << GridLogMessage -#define SEP \ -"=============================================================================" - using namespace Grid; int main (int argc, char ** argv) diff --git a/benchmarks/benchmark-io-csv.sh b/benchmarks/benchmark-io-csv.sh deleted file mode 100755 index cc61b006..00000000 --- a/benchmarks/benchmark-io-csv.sh +++ /dev/null @@ 
-1,76 +0,0 @@ -#!/usr/bin/env bash - -awkscript=' -BEGIN{ - i = 0; - print "local L,std read (MB/s),std write (MB/s),Grid Lime read (MB/s),Grid Lime write (MB/s)" -} - -/Benchmark std write/{ - i = 0; - mode = "stdWrite"; -} - -/Benchmark std read/{ - i = 0; - mode = "stdRead" -} - -/Benchmark Grid C-Lime write/{ - i = 0; - mode = "gridWrite"; -} - -/Benchmark Grid C-Lime read/{ - i = 0; - mode = "gridRead"; -} - -/Local volume/{ - match($0, "[0-9]+\\^4"); - l[i] = substr($0, RSTART, RLENGTH-2); -} - -/MB\/s/{ - match($0, "[0-9.eE]+ MB/s"); - p = substr($0, RSTART, RLENGTH-5); - if (mode == "stdWrite") - { - sw[i] = p; - } - else if (mode == "stdRead") - { - sr[i] = p; - } - else if (mode == "gridWrite") - { - gw[i] = p; - } - else if (mode == "gridRead") - { - gr[i] = p; - } - i++; -} - -END{ - s = 0 - for (a in l) - { - s++; - } - for (j = 0; j < s; j++) - { - printf("%s,%s,%s,%s,%s\n", l[j], sr[j], sw[j], gr[j], gw[j]); - } - printf("\n"); -} -' - -if (( $# != 1 )); then - echo "usage: `basename $0` " 1>&2 - exit 1 -fi -LOG=$1 - -awk "${awkscript}" ${LOG} From 5f893bf9aff17781ce363512cccdd63b2e126b1a Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Fri, 9 Oct 2020 21:31:59 +0100 Subject: [PATCH 027/201] Benchmark_IO procurement sizes --- benchmarks/Benchmark_IO.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index 0393257d..04e3a735 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -2,11 +2,11 @@ #include "Benchmark_IO.hpp" #ifndef BENCH_IO_LMIN -#define BENCH_IO_LMIN 20 +#define BENCH_IO_LMIN 8 #endif #ifndef BENCH_IO_LMAX -#define BENCH_IO_LMAX 30 +#define BENCH_IO_LMAX 48 #endif #ifndef BENCH_IO_NPASS From b0d61b9687d441bd651622970aadbf6200f16c22 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Fri, 9 Oct 2020 21:46:45 +0100 Subject: [PATCH 028/201] Benchmark_IO cleaner output --- benchmarks/Benchmark_IO.cc | 7 +++++-- 1 file changed, 5 
insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index 04e3a735..76a2375f 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -50,8 +50,11 @@ void stats(Mat &mean, Mat &stdDev, const std::vector &data) } #define grid_printf(...) \ -MSG << "";\ -printf(__VA_ARGS__); +{\ + char _buf[1024];\ + sprintf(_buf, __VA_ARGS__);\ + MSG << _buf;\ +} enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3}; From c2b688abc949d67cff19ba296dbd3d02ea3ec8de Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Sat, 10 Oct 2020 16:52:56 +0100 Subject: [PATCH 029/201] Benchmark_IO: reducing max local volume to 32^4 --- benchmarks/Benchmark_IO.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index 76a2375f..0d80d425 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -6,7 +6,7 @@ #endif #ifndef BENCH_IO_LMAX -#define BENCH_IO_LMAX 48 +#define BENCH_IO_LMAX 32 #endif #ifndef BENCH_IO_NPASS From d55cc5b3809986be1a194dba56b22544c797b2a5 Mon Sep 17 00:00:00 2001 From: Sam Mangham Date: Mon, 12 Oct 2020 12:33:13 +0100 Subject: [PATCH 030/201] Fixed typo on --enable-comm, removed all references to --enable-precision except for config options, where it is listed as deprecated. Removed travis test for single precision. 
--- .travis.yml | 7 +------ README | 33 +++++++++++------------------ README.md | 33 +++++++++++------------------ SVE_README.txt | 24 ++++++++++----------- documentation/GridXcode/readme.md | 8 +++---- documentation/manual.rst | 35 ++++++++++--------------------- 6 files changed, 52 insertions(+), 88 deletions(-) diff --git a/.travis.yml b/.travis.yml index 129fd582..3a0e1e35 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,11 +9,6 @@ matrix: - os: osx osx_image: xcode8.3 compiler: clang - env: PREC=single - - os: osx - osx_image: xcode8.3 - compiler: clang - env: PREC=double before_install: - export GRIDDIR=`pwd` @@ -55,7 +50,7 @@ script: - make -j4 - make install - cd $CWD/build - - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} + - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - make check diff --git a/README b/README index 86506f52..0beabff3 100644 --- a/README +++ b/README @@ -111,11 +111,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu ``` bash mkdir build; cd build -../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix= +../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix= ``` -where `--enable-precision=` set the default precision, -`--enable-simd=` set the SIMD type, `--enable- +where `--enable-simd=` set the SIMD type, `--enable- comms=`, and `` should be replaced by the prefix path where you want to install Grid. Other options are detailed in the next section, you can also use `configure --help` to display them. Like with any other program using GNU autotool, the @@ -146,8 +145,8 @@ If you want to build all the tests at once just use `make tests`. 
- `--enable-numa`: enable NUMA first touch optimisation - `--enable-simd=`: setup Grid for the SIMD target `` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-gen-simd-width=`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). -- `--enable-precision={single|double}`: set the default precision (default: `double`). -- `--enable-precision=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. +- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** +- `--enable-comms=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--disable-timers`: disable system dependent high-resolution timers. - `--enable-chroma`: enable Chroma regression tests. @@ -201,8 +200,7 @@ Alternatively, some CPU codenames can be directly used: The following configuration is recommended for the Intel Knights Landing platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -212,8 +210,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -232,8 +229,7 @@ for interior communication. This is the mpi3 communications implementation. We recommend four ranks per node for best performance, but optimum is local volume dependent. 
``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi3-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -244,8 +240,7 @@ We recommend four ranks per node for best performance, but optimum is local volu The following configuration is recommended for the Intel Haswell platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -262,8 +257,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -280,8 +274,7 @@ This is the default. The following configuration is recommended for the Intel Skylake platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=mpiicpc @@ -298,8 +291,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -330,8 +322,7 @@ and 8 threads per rank. The following configuration is recommended for the AMD EPYC platform. 
``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ CXX=mpicxx ``` diff --git a/README.md b/README.md index 9f690ce0..4cbae720 100644 --- a/README.md +++ b/README.md @@ -115,11 +115,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu ``` bash mkdir build; cd build -../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix= +../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix= ``` -where `--enable-precision=` set the default precision, -`--enable-simd=` set the SIMD type, `--enable- +where `--enable-simd=` set the SIMD type, `--enable- comms=`, and `` should be replaced by the prefix path where you want to install Grid. Other options are detailed in the next section, you can also use `configure --help` to display them. Like with any other program using GNU autotool, the @@ -150,8 +149,8 @@ If you want to build all the tests at once just use `make tests`. - `--enable-numa`: enable NUMA first touch optimisation - `--enable-simd=`: setup Grid for the SIMD target `` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-gen-simd-width=`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). -- `--enable-precision={single|double}`: set the default precision (default: `double`). -- `--enable-precision=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. +- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** +- `--enable-comms=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--disable-timers`: disable system dependent high-resolution timers. - `--enable-chroma`: enable Chroma regression tests. 
@@ -205,8 +204,7 @@ Alternatively, some CPU codenames can be directly used: The following configuration is recommended for the Intel Knights Landing platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -216,8 +214,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -236,8 +233,7 @@ for interior communication. This is the mpi3 communications implementation. We recommend four ranks per node for best performance, but optimum is local volume dependent. ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi3-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -248,8 +244,7 @@ We recommend four ranks per node for best performance, but optimum is local volu The following configuration is recommended for the Intel Haswell platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -266,8 +261,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -284,8 +278,7 @@ This is the default. 
The following configuration is recommended for the Intel Skylake platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=mpiicpc @@ -302,8 +295,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -334,8 +326,7 @@ and 8 threads per rank. The following configuration is recommended for the AMD EPYC platform. ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ CXX=mpicxx ``` diff --git a/SVE_README.txt b/SVE_README.txt index 0c167c4a..cefec4be 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -12,31 +12,31 @@ module load mpi/openmpi-aarch64 scl enable gcc-toolset-10 bash -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" * gcc 10.1 prebuild w/ MPI, QPACE4 interactive login scl enable gcc-toolset-10 bash module load mpi/openmpi-aarch64 -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 
--enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" ------------------------------------------------------------------------------ * armclang 20.2 (qp4) -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" ------------------------------------------------------------------------------ * gcc 10.0.1 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static * gcc 10.0.1 fixed-size ACLE (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" 
+../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" * gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI @@ -46,34 +46,34 @@ export OMPI_CXX=g++-10.0.1 export MPICH_CC=gcc-10.0.1 export MPICH_CXX=g++-10.0.1 -$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" +$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" -------------------------------------------------------- * armclang 20.0 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM 
-DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static TODO check ARMCLANGCOMPAT * armclang 20.1 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static TODO check ARMCLANGCOMPAT * armclang 20.1 VLA (fjt cluster) -../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" TODO check ARMCLANGCOMPAT * armclang 20.1 VLA w/MPI (fjt cluster) -../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 
-mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" No ARMCLANGCOMPAT -> still correct ? @@ -81,9 +81,9 @@ No ARMCLANGCOMPAT -> still correct ? * Fujitsu fcc -../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" * Fujitsu fcc w/ MPI -../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" diff --git a/documentation/GridXcode/readme.md b/documentation/GridXcode/readme.md index b8342828..3bd6fc30 100644 --- a/documentation/GridXcode/readme.md +++ b/documentation/GridXcode/readme.md @@ -184,19 +184,19 @@ Below are shown the `configure` script invocations for three recommended configu This is the build for every day developing and debugging with Xcode. It uses the Xcode clang c++ compiler, without MPI, and defaults to double-precision. 
Xcode builds the `Debug` configuration with debug symbols for full debugging: - ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=double --prefix=$GridPre/Debug + ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Debug #### 2. `Release` -Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`, except using single-precision (handy for validation): +Since Grid itself doesn't really have debug configurations, the release build is recommended to be the same as `Debug`: - ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --enable-precision=single --prefix=$GridPre/Release + ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=none --prefix=$GridPre/Release #### 3. 
`MPIDebug` Debug configuration with MPI: - ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --enable-precision=double --prefix=$GridPre/MPIDebug + ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug ### 5.3 Build Grid diff --git a/documentation/manual.rst b/documentation/manual.rst index 1596de5e..d51f07c1 100644 --- a/documentation/manual.rst +++ b/documentation/manual.rst @@ -178,15 +178,10 @@ Then enter the cloned directory and set up the build system:: Now you can execute the `configure` script to generate makefiles (here from a build directory):: mkdir build; cd build - ../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto \ + ../configure --enable-simd=AVX --enable-comms=mpi-auto \ --prefix= -where:: - - --enable-precision=single|double - -sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations, -and that explicit type selection be made at all points. Naturally, most code will be type templated in any case.:: +:: --enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX @@ -236,7 +231,7 @@ Detailed build configuration options --enable-mkl[=path] use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional). --enable-simd=code setup Grid for the SIMD target ``(default: `GEN`). 
A list of possible SIMD targets is detailed in a section below. --enable-gen-simd-width=size select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes. - --enable-precision=single|double set the default precision (default: `double`). + --enable-precision=single|double set the default precision (default: `double`). **Deprecated option** --enable-comms=mpi|none use `` for message passing (default: `none`). --enable-rng=sitmo|ranlux48|mt19937 choose the RNG (default: `sitmo`). --disable-timers disable system dependent high-resolution timers. @@ -304,8 +299,7 @@ Build setup for Intel Knights Landing platform The following configuration is recommended for the Intel Knights Landing platform:: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -314,8 +308,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -332,8 +325,7 @@ presently performs better with use of more than one rank per node, using shared for interior communication. We recommend four ranks per node for best performance, but optimum is local volume dependent. 
:: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -343,8 +335,7 @@ Build setup for Intel Haswell Xeon platform The following configuration is recommended for the Intel Haswell platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -360,8 +351,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -379,8 +369,7 @@ Build setup for Intel Skylake Xeon platform The following configuration is recommended for the Intel Skylake platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX512 \ + ../configure --enable-simd=AVX512 \ --enable-comms=mpi \ --enable-mkl \ CXX=mpiicpc @@ -396,8 +385,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=AVX512 \ + ../configure --enable-simd=AVX512 \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -422,8 +410,7 @@ and 8 threads per rank. 
The following configuration is recommended for the AMD EPYC platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi \ CXX=mpicxx From 3f0620972061a62fe8802e25ca43d896d9172f09 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Oct 2020 22:18:51 -0400 Subject: [PATCH 031/201] Pretty print --- benchmarks/Benchmark_ITT.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index eb275728..54fe1ab0 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -62,7 +62,7 @@ struct time_statistics{ void comms_header(){ std::cout < Date: Tue, 13 Oct 2020 22:23:57 -0400 Subject: [PATCH 032/201] Reality forced included --- Grid/lattice/Lattice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index 28ea0294..9f5f1da7 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -36,7 +36,7 @@ Author: Peter Boyle #include #include #include -//#include +#include #include #include #include From 9945399e609945bae2c01492cbb4ab56a9246ec8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Oct 2020 22:24:32 -0400 Subject: [PATCH 033/201] Reaality issues fix by drop from ET --- Grid/lattice/Lattice_ET.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index c43844f8..8d9f4744 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -342,14 +342,10 @@ inline void ExpressionViewClose(LatticeTrinaryExpression &expr) GridUnopClass(UnarySub, -a); GridUnopClass(UnaryNot, Not(a)); -GridUnopClass(UnaryAdj, adj(a)); -GridUnopClass(UnaryConj, conjugate(a)); GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); 
-GridUnopClass(UnaryToReal, toReal(a)); -GridUnopClass(UnaryToComplex, toComplex(a)); GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); GridUnopClass(UnaryAbs, abs(a)); @@ -456,14 +452,12 @@ GridTrinOpClass(TrinaryWhere, GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot); -GRID_DEF_UNOP(adj, UnaryAdj); -GRID_DEF_UNOP(conjugate, UnaryConj); +//GRID_DEF_UNOP(adj, UnaryAdj); +//GRID_DEF_UNOP(conjugate, UnaryConj); GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); -GRID_DEF_UNOP(toReal, UnaryToReal); -GRID_DEF_UNOP(toComplex, UnaryToComplex); GRID_DEF_UNOP(timesI, UnaryTimesI); GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the From aa135412f554b5712a62164fdd3136f7e38e16c5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Oct 2020 22:25:01 -0400 Subject: [PATCH 034/201] toComplex, toReal --- Grid/lattice/Lattice_reality.h | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/Grid/lattice/Lattice_reality.h b/Grid/lattice/Lattice_reality.h index 61491d6b..e07dd545 100644 --- a/Grid/lattice/Lattice_reality.h +++ b/Grid/lattice/Lattice_reality.h @@ -64,6 +64,43 @@ template inline Lattice conjugate(const Lattice &lhs){ return ret; }; +template inline Lattice toComplex(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard() = lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] = toComplex(lhs_v[ss]); + }); + return ret; +}; +template inline Lattice toReal(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard() = lhs.Checkerboard(); + accelerator_for( ss, 
lhs_v.size(), 1, { + ret_v[ss] = toReal(lhs_v[ss]); + }); + return ret; +}; + + +template::value,void>::type * = nullptr> +auto toComplex(const Expression &expr) -> decltype(closure(expr)) +{ + return toComplex(closure(expr)); +} +template::value,void>::type * = nullptr> +auto toReal(const Expression &expr) -> decltype(closure(expr)) +{ + return toReal(closure(expr)); +} + NAMESPACE_END(Grid); #endif From a88b3ceca57b616b21e88896a02d5bf224de7242 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 14 Oct 2020 21:33:51 -0400 Subject: [PATCH 035/201] Closure cases --- Grid/lattice/Lattice_reality.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Grid/lattice/Lattice_reality.h b/Grid/lattice/Lattice_reality.h index e07dd545..2e80ce4a 100644 --- a/Grid/lattice/Lattice_reality.h +++ b/Grid/lattice/Lattice_reality.h @@ -100,6 +100,16 @@ auto toReal(const Expression &expr) -> decltype(closure(expr)) { return toReal(closure(expr)); } +template::value,void>::type * = nullptr> +auto adj(const Expression &expr) -> decltype(closure(expr)) +{ + return adj(closure(expr)); +} +template::value,void>::type * = nullptr> +auto conjugate(const Expression &expr) -> decltype(closure(expr)) +{ + return conjugate(closure(expr)); +} NAMESPACE_END(Grid); From bf3c9857e0409036ddb9922775e3c7c8d7e331af Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 14 Oct 2020 21:37:14 -0400 Subject: [PATCH 036/201] Closure changes --- .../implementation/WilsonCloverFermionImplementation.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index df1bce7c..e721c20d 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -133,14 +133,14 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) 
pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm); - pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm))); - pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm))); + pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm)); + pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm)); pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); - pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv))); - pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv))); + pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); + pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); } template From 3362f8dfa0b9e1122a5923d9d13becdab534e54a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 14 Oct 2020 22:59:41 -0400 Subject: [PATCH 037/201] happy compile --- Grid/lattice/Lattice_ET.h | 16 ++--- Grid/lattice/Lattice_reality.h | 4 +- Grid/qcd/utils/SUn.h | 3 +- tests/core/Test_lie_generators.cc | 112 +++++++++++++++--------------- 4 files changed, 68 insertions(+), 67 deletions(-) diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index 8d9f4744..f828ef30 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -488,27 +488,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere); ///////////////////////////////////////////////////////////// template auto closure(const LatticeUnaryExpression &expr) - -> Lattice + -> Lattice::type > { - Lattice ret(expr); + Lattice::type > ret(expr); return ret; } template auto closure(const LatticeBinaryExpression &expr) - -> Lattice + -> Lattice::type > { - Lattice ret(expr); + Lattice::type > ret(expr); return ret; } template auto closure(const LatticeTrinaryExpression &expr) - -> Lattice Lattice + vecEval(0, expr.arg3)))>::type > { - Lattice ret(expr); + vecEval(0, expr.arg3)))>::type > ret(expr); return ret; } #define 
EXPRESSION_CLOSURE(function) \ diff --git a/Grid/lattice/Lattice_reality.h b/Grid/lattice/Lattice_reality.h index 2e80ce4a..51deeb01 100644 --- a/Grid/lattice/Lattice_reality.h +++ b/Grid/lattice/Lattice_reality.h @@ -45,8 +45,8 @@ template inline Lattice adj(const Lattice &lhs){ autoView( ret_v, ret, AcceleratorWrite); ret.Checkerboard()=lhs.Checkerboard(); - accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { - coalescedWrite(ret_v[ss], adj(lhs_v(ss))); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] = adj(lhs_v[ss]); }); return ret; }; diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 0cc0cc1a..7ac53246 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -449,7 +449,8 @@ public: LatticeReal alpha(grid); // std::cout<::printGenerators(); - std::cout << "Dimension of adjoint representation: "<< SUAdjoint::Dimension << std::endl; - SUAdjoint::printGenerators(); - SU::testGenerators(); - SUAdjoint::testGenerators(); + SU3::printGenerators(); + std::cout << "Dimension of adjoint representation: "<< SU3Adjoint::Dimension << std::endl; + SU3Adjoint::printGenerators(); + SU3::testGenerators(); + SU3Adjoint::testGenerators(); std::cout<({45,12,81,9})); - SUAdjoint::LatticeAdjMatrix Gauss(grid); - SU::LatticeAlgebraVector ha(grid); - SU::LatticeAlgebraVector hb(grid); + SU3Adjoint::LatticeAdjMatrix Gauss(grid); + SU3::LatticeAlgebraVector ha(grid); + SU3::LatticeAlgebraVector hb(grid); random(gridRNG,Gauss); std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl; - SUAdjoint::projectOnAlgebra(ha, Gauss); + SU3Adjoint::projectOnAlgebra(ha, Gauss); std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl; std::cout << GridLogMessage << "Start projector" << std::endl; - SUAdjoint::projector(hb, Gauss); + SU3Adjoint::projector(hb, Gauss); std::cout << GridLogMessage << "end projector" << std::endl; std::cout << GridLogMessage << "ReStart projector" << std::endl; - SUAdjoint::projector(hb, Gauss); + 
SU3Adjoint::projector(hb, Gauss); std::cout << GridLogMessage << "end projector" << std::endl; - SU::LatticeAlgebraVector diff = ha -hb; + SU3::LatticeAlgebraVector diff = ha -hb; std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl; @@ -114,8 +114,8 @@ int main(int argc, char** argv) { LatticeGaugeField U(grid), V(grid); - SU::HotConfiguration(gridRNG, U); - SU::HotConfiguration(gridRNG, V); + SU3::HotConfiguration(gridRNG, U); + SU3::HotConfiguration(gridRNG, V); // Adjoint representation // Test group structure @@ -123,8 +123,8 @@ int main(int argc, char** argv) { LatticeGaugeField UV(grid); UV = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu = peekLorentz(U,mu); - SU::LatticeMatrix Vmu = peekLorentz(V,mu); + SU3::LatticeMatrix Umu = peekLorentz(U,mu); + SU3::LatticeMatrix Vmu = peekLorentz(V,mu); pokeLorentz(UV,Umu*Vmu, mu); } @@ -151,16 +151,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_adj(grid); + SU3::LatticeAlgebraVector h_adj(grid); typename AdjointRep::LatticeMatrix Ar(grid); random(gridRNG,h_adj); h_adj = real(h_adj); SU_Adjoint::AdjointLieAlgebraMatrix(h_adj,Ar); // Re-extract h_adj - SU::LatticeAlgebraVector h_adj2(grid); + SU3::LatticeAlgebraVector h_adj2(grid); SU_Adjoint::projectOnAlgebra(h_adj2, Ar); - SU::LatticeAlgebraVector h_diff = h_adj - h_adj2; + SU3::LatticeAlgebraVector h_diff = h_adj - h_adj2; std::cout << GridLogMessage << "Projections structure check vector difference (Adjoint representation) : " << norm2(h_diff) << std::endl; // Exponentiate @@ -183,14 +183,14 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af(grid); - SU::FundamentalLieAlgebraMatrix(h_adj,Af); - SU::LatticeMatrix Ufund(grid); + SU3::LatticeMatrix Af(grid); + SU3::FundamentalLieAlgebraMatrix(h_adj,Af); + SU3::LatticeMatrix Ufund(grid); Ufund = expMat(Af, 1.0, 16); 
// Check unitarity - SU::LatticeMatrix uno_f(grid); + SU3::LatticeMatrix uno_f(grid); uno_f = 1.0; - SU::LatticeMatrix UnitCheck(grid); + SU3::LatticeMatrix UnitCheck(grid); UnitCheck = Ufund * adj(Ufund) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck) << std::endl; @@ -260,20 +260,20 @@ int main(int argc, char** argv) { std::cout << GridLogMessage << "Test for the Two Index Symmetric projectors" << std::endl; // Projectors - SUTwoIndexSymm::LatticeTwoIndexMatrix Gauss2(grid); + SU3TwoIndexSymm::LatticeTwoIndexMatrix Gauss2(grid); random(gridRNG,Gauss2); std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl; - SUTwoIndexSymm::projectOnAlgebra(ha, Gauss2); + SU3TwoIndexSymm::projectOnAlgebra(ha, Gauss2); std::cout << GridLogMessage << "end projectOnAlgebra" << std::endl; std::cout << GridLogMessage << "Start projector" << std::endl; - SUTwoIndexSymm::projector(hb, Gauss2); + SU3TwoIndexSymm::projector(hb, Gauss2); std::cout << GridLogMessage << "end projector" << std::endl; std::cout << GridLogMessage << "ReStart projector" << std::endl; - SUTwoIndexSymm::projector(hb, Gauss2); + SU3TwoIndexSymm::projector(hb, Gauss2); std::cout << GridLogMessage << "end projector" << std::endl; - SU::LatticeAlgebraVector diff2 = ha - hb; + SU3::LatticeAlgebraVector diff2 = ha - hb; std::cout << GridLogMessage << "Difference: " << norm2(diff) << std::endl; std::cout << GridLogMessage << "*********************************************" << std::endl; @@ -284,20 +284,20 @@ int main(int argc, char** argv) { std::cout << GridLogMessage << "Test for the Two index anti-Symmetric projectors" << std::endl; // Projectors - SUTwoIndexAntiSymm::LatticeTwoIndexMatrix Gauss2a(grid); + SU3TwoIndexAntiSymm::LatticeTwoIndexMatrix Gauss2a(grid); random(gridRNG,Gauss2a); std::cout << GridLogMessage << "Start projectOnAlgebra" << std::endl; - SUTwoIndexAntiSymm::projectOnAlgebra(ha, Gauss2a); + SU3TwoIndexAntiSymm::projectOnAlgebra(ha, Gauss2a); std::cout 
<< GridLogMessage << "end projectOnAlgebra" << std::endl; std::cout << GridLogMessage << "Start projector" << std::endl; - SUTwoIndexAntiSymm::projector(hb, Gauss2a); + SU3TwoIndexAntiSymm::projector(hb, Gauss2a); std::cout << GridLogMessage << "end projector" << std::endl; std::cout << GridLogMessage << "ReStart projector" << std::endl; - SUTwoIndexAntiSymm::projector(hb, Gauss2a); + SU3TwoIndexAntiSymm::projector(hb, Gauss2a); std::cout << GridLogMessage << "end projector" << std::endl; - SU::LatticeAlgebraVector diff2a = ha - hb; + SU3::LatticeAlgebraVector diff2a = ha - hb; std::cout << GridLogMessage << "Difference: " << norm2(diff2a) << std::endl; std::cout << GridLogMessage << "*********************************************" << std::endl; @@ -311,14 +311,14 @@ int main(int argc, char** argv) { // Test group structure // (U_f * V_f)_r = U_r * V_r LatticeGaugeField U2(grid), V2(grid); - SU::HotConfiguration(gridRNG, U2); - SU::HotConfiguration(gridRNG, V2); + SU3::HotConfiguration(gridRNG, U2); + SU3::HotConfiguration(gridRNG, V2); LatticeGaugeField UV2(grid); UV2 = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu2 = peekLorentz(U2,mu); - SU::LatticeMatrix Vmu2 = peekLorentz(V2,mu); + SU3::LatticeMatrix Umu2 = peekLorentz(U2,mu); + SU3::LatticeMatrix Vmu2 = peekLorentz(V2,mu); pokeLorentz(UV2,Umu2*Vmu2, mu); } @@ -345,16 +345,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_sym(grid); + SU3::LatticeAlgebraVector h_sym(grid); typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ar_sym(grid); random(gridRNG,h_sym); h_sym = real(h_sym); SU_TwoIndex::TwoIndexLieAlgebraMatrix(h_sym,Ar_sym); // Re-extract h_sym - SU::LatticeAlgebraVector h_sym2(grid); + SU3::LatticeAlgebraVector h_sym2(grid); SU_TwoIndex< Nc, Symmetric>::projectOnAlgebra(h_sym2, Ar_sym); - SU::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2; + SU3::LatticeAlgebraVector 
h_diff_sym = h_sym - h_sym2; std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl; @@ -379,11 +379,11 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af_sym(grid); - SU::FundamentalLieAlgebraMatrix(h_sym,Af_sym); - SU::LatticeMatrix Ufund2(grid); + SU3::LatticeMatrix Af_sym(grid); + SU3::FundamentalLieAlgebraMatrix(h_sym,Af_sym); + SU3::LatticeMatrix Ufund2(grid); Ufund2 = expMat(Af_sym, 1.0, 16); - SU::LatticeMatrix UnitCheck2(grid); + SU3::LatticeMatrix UnitCheck2(grid); UnitCheck2 = Ufund2 * adj(Ufund2) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2) << std::endl; @@ -421,14 +421,14 @@ int main(int argc, char** argv) { // Test group structure // (U_f * V_f)_r = U_r * V_r LatticeGaugeField U2A(grid), V2A(grid); - SU::HotConfiguration(gridRNG, U2A); - SU::HotConfiguration(gridRNG, V2A); + SU3::HotConfiguration(gridRNG, U2A); + SU3::HotConfiguration(gridRNG, V2A); LatticeGaugeField UV2A(grid); UV2A = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu2A = peekLorentz(U2,mu); - SU::LatticeMatrix Vmu2A = peekLorentz(V2,mu); + SU3::LatticeMatrix Umu2A = peekLorentz(U2,mu); + SU3::LatticeMatrix Vmu2A = peekLorentz(V2,mu); pokeLorentz(UV2A,Umu2A*Vmu2A, mu); } @@ -455,16 +455,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_Asym(grid); + SU3::LatticeAlgebraVector h_Asym(grid); typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ar_Asym(grid); random(gridRNG,h_Asym); h_Asym = real(h_Asym); SU_TwoIndex< Nc, AntiSymmetric>::TwoIndexLieAlgebraMatrix(h_Asym,Ar_Asym); // Re-extract h_sym - SU::LatticeAlgebraVector h_Asym2(grid); + SU3::LatticeAlgebraVector h_Asym2(grid); SU_TwoIndex< Nc, AntiSymmetric>::projectOnAlgebra(h_Asym2, Ar_Asym); - SU::LatticeAlgebraVector h_diff_Asym = 
h_Asym - h_Asym2; + SU3::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2; std::cout << GridLogMessage << "Projections structure check vector difference (Two Index anti-Symmetric): " << norm2(h_diff_Asym) << std::endl; @@ -489,11 +489,11 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af_Asym(grid); - SU::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym); - SU::LatticeMatrix Ufund2A(grid); + SU3::LatticeMatrix Af_Asym(grid); + SU3::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym); + SU3::LatticeMatrix Ufund2A(grid); Ufund2A = expMat(Af_Asym, 1.0, 16); - SU::LatticeMatrix UnitCheck2A(grid); + SU3::LatticeMatrix UnitCheck2A(grid); UnitCheck2A = Ufund2A * adj(Ufund2A) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2A) << std::endl; From d060341168ab9082649f813fc3727fb65f08d3cd Mon Sep 17 00:00:00 2001 From: KANAMORI Issaku Date: Fri, 16 Oct 2020 21:39:17 +0900 Subject: [PATCH 038/201] add an error check for Parameters.StartingType --- Grid/qcd/hmc/GenericHMCrunner.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index c2443dd0..98e8175a 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -159,6 +159,13 @@ private: Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U, Resources.GetSerialRNG(), Resources.GetParallelRNG()); + } else { + // others + std::cout << GridLogError << "Unrecognized StartingType\n"; + std::cout + << GridLogError + << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + exit(1); } Smearing.set_Field(U); From 463d72d322581a256369e9c42cdb30ce0e5595fc Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Mon, 19 Oct 2020 16:13:28 +0100 Subject: [PATCH 039/201] Added untraced baryon contraction code --- Grid/qcd/utils/BaryonUtils.h | 231 +++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) diff --git 
a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index b268b684..beab3436 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -61,6 +61,16 @@ public: const int parity, const bool * wick_contractions, robj &result); + template + static void baryon_site_matrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool * wick_contractions, + robj &result); public: static void Wick_Contractions(std::string qi, std::string qf, @@ -75,6 +85,15 @@ public: const bool* wick_contractions, const int parity, ComplexField &baryon_corr); + static void ContractBaryons_matrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + SpinMatrixField &baryon_corr); template static void ContractBaryons_Sliced(const mobj &D1, const mobj &D2, @@ -87,6 +106,17 @@ public: const int parity, const int nt, robj &result); + template + static void ContractBaryons_Sliced_matrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int nt, + robj &result); private: template static void Baryon_Gamma_3pt_Group1_Site( @@ -329,6 +359,126 @@ void BaryonUtils::baryon_site(const mobj &D1, }} } +//New version without parity projection or trace +template +template +void BaryonUtils::baryon_site_matrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, + const bool * wick_contraction, + robj &result) +{ + + auto D1_GAi = D1 * GammaA_i; + auto GAf_D1_GAi = GammaA_f * D1_GAi; + auto GBf_D1_GAi = GammaB_f * D1_GAi; + + auto 
D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; + + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = epsilon[ie_f][0]; //a + int b_f = epsilon[ie_f][1]; //b + int c_f = epsilon[ie_f][2]; //c + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = epsilon[ie_i][0]; //a' + int b_i = epsilon[ie_i][1]; //b' + int c_i = epsilon[ie_i][2]; //c' + + Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + //This is the \delta_{456}^{123} part + if (wick_contraction[0]){ + for (int rho_i=0; rho_i::ContractBaryons(const PropagatorField &q1_left, t += usecond(); std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; +} + +template +void BaryonUtils::ContractBaryons_matrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + SpinMatrixField &baryon_corr) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; + std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; + std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; + std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; + + GridBase *grid = q1_left.Grid(); + + autoView(vbaryon_corr, baryon_corr,CpuWrite); + autoView( v1 , q1_left, CpuRead); + autoView( v2 , q2_left, CpuRead); + autoView( v3 , q3_left, CpuRead); + + // Real bytes =0.; + // bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); + // for (int ie=0; ie < 6 ; ie++){ + // if(ie==0 or ie==3){ + // bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; + // } + // else{ + // bytes += 
grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; + // } + // } + // Real t=0.; + // t =-usecond(); + + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto D1 = v1[ss]; + auto D2 = v2[ss]; + auto D3 = v3[ss]; + sobj result=Zero(); + baryon_site_matrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); + vbaryon_corr[ss] = result; + } );//end loop over lattice sites + + // t += usecond(); + + // std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } @@ -442,6 +646,33 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, } } +template +template +void BaryonUtils::ContractBaryons_Sliced_matrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int nt, + robj &result) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; + std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; + std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; + std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; + + for (int t=0; t Date: Wed, 21 Oct 2020 11:58:53 +0100 Subject: [PATCH 040/201] BaryonUtils function naming change --- Grid/qcd/utils/BaryonUtils.h | 86 ++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index beab3436..1259225a 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -51,7 +51,7 @@ public: private: template - static void baryon_site(const mobj &D1, + static void BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -62,7 +62,7 @@ public: const bool * 
wick_contractions, robj &result); template - static void baryon_site_matrix(const mobj &D1, + static void BaryonSiteMatrix(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -72,7 +72,7 @@ public: const bool * wick_contractions, robj &result); public: - static void Wick_Contractions(std::string qi, + static void WickContractions(std::string qi, std::string qf, bool* wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, @@ -85,7 +85,7 @@ public: const bool* wick_contractions, const int parity, ComplexField &baryon_corr); - static void ContractBaryons_matrix(const PropagatorField &q1_left, + static void ContractBaryonsMatrix(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, const Gamma GammaA_left, @@ -95,7 +95,7 @@ public: const bool* wick_contractions, SpinMatrixField &baryon_corr); template - static void ContractBaryons_Sliced(const mobj &D1, + static void ContractBaryonsSliced(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -107,7 +107,7 @@ public: const int nt, robj &result); template - static void ContractBaryons_Sliced_matrix(const mobj &D1, + static void ContractBaryonsSlicedMatrix(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -119,7 +119,7 @@ public: robj &result); private: template - static void Baryon_Gamma_3pt_Group1_Site( + static void BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, @@ -131,7 +131,7 @@ public: robj &result); template - static void Baryon_Gamma_3pt_Group2_Site( + static void BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, @@ -143,7 +143,7 @@ public: robj &result); template - static void Baryon_Gamma_3pt_Group3_Site( + static void BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, @@ -155,7 +155,7 @@ public: robj &result); public: template - static void 
Baryon_Gamma_3pt( + static void BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, @@ -168,7 +168,7 @@ public: SpinMatrixField &stn_corr); private: template - static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, + static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -177,7 +177,7 @@ public: const Gamma GammaB_nucl, robj &result); template - static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, + static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -189,7 +189,7 @@ public: template - static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, + static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -198,7 +198,7 @@ public: const Gamma GammaB_nucl, robj &result); template - static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, + static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -209,7 +209,7 @@ public: robj &result); public: template - static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, + static void SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, @@ -219,7 +219,7 @@ public: const std::string op, SpinMatrixField &stn_corr); template - static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, + static void SigmaToNucleonNonEye(const PropagatorField &qq_ti, const PropagatorField &qq_tf, const mobj &Du_spec, const PropagatorField &qd_tf, @@ -247,7 +247,7 @@ const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; //This is the old version template template -void BaryonUtils::baryon_site(const mobj &D1, +void BaryonUtils::BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_i, @@ -362,7 +362,7 @@ void 
BaryonUtils::baryon_site(const mobj &D1, //New version without parity projection or trace template template -void BaryonUtils::baryon_site_matrix(const mobj &D1, +void BaryonUtils::BaryonSiteMatrix(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_i, @@ -484,7 +484,7 @@ void BaryonUtils::baryon_site_matrix(const mobj &D1, * flavours. * * The array wick_contractions must be of length 6 */ template -void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { +void BaryonUtils::WickContractions(std::string qi, std::string qf, bool* wick_contractions) { const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; for (int ie=0; ie < 6 ; ie++) { wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 @@ -547,7 +547,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D2 = v2[ss]; auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); + BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites @@ -557,7 +557,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, } template -void BaryonUtils::ContractBaryons_matrix(const PropagatorField &q1_left, +void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, const Gamma GammaA_left, @@ -601,7 +601,7 @@ void BaryonUtils::ContractBaryons_matrix(const PropagatorField &q1_left, auto D2 = v2[ss]; auto D3 = v3[ss]; sobj result=Zero(); - baryon_site_matrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); + BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites @@ -618,7 +618,7 @@ void 
BaryonUtils::ContractBaryons_matrix(const PropagatorField &q1_left, * Wick_Contractions function above */ template template -void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, +void BaryonUtils::ContractBaryonsSliced(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -642,13 +642,13 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); for (int t=0; t template -void BaryonUtils::ContractBaryons_Sliced_matrix(const mobj &D1, +void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -669,7 +669,7 @@ void BaryonUtils::ContractBaryons_Sliced_matrix(const mobj &D1, std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; for (int t=0; t::ContractBaryons_Sliced_matrix(const mobj &D1, * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( +void BaryonUtils::BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, @@ -777,7 +777,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( +void BaryonUtils::BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, @@ -867,7 +867,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( +void BaryonUtils::BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, @@ -959,7 +959,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( * https://aportelli.github.io/Hadrons-doc/#/mcontraction */ template template -void BaryonUtils::Baryon_Gamma_3pt( +void BaryonUtils::BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, @@ -982,7 +982,7 @@ void 
BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites } else if (group == 2) { @@ -990,7 +990,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites } else if (group == 3) { @@ -998,7 +998,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites @@ -1018,7 +1018,7 @@ void BaryonUtils::Baryon_Gamma_3pt( * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, +void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -1069,7 +1069,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, +void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -1128,7 +1128,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, * Ds_ti is a quark line from t_i to t_H */ 
template template -void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, +void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -1179,7 +1179,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, +void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -1233,7 +1233,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, template template -void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, +void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, @@ -1260,9 +1260,9 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, auto Ds_ti = vs_ti[ss]; sobj result=Zero(); if(op == "Q1"){ - Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } @@ -1272,7 +1272,7 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, template template -void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, +void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, const PropagatorField &qq_tf, const mobj &Du_spec, const PropagatorField &qd_tf, @@ -1302,9 +1302,9 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, auto Ds_ti = vs_ti[ss]; sobj result=Zero(); 
if(op == "Q1"){ - Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } From 52d17987dc3e09f537fa8d7d3a9403712e3d62c5 Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Fri, 23 Oct 2020 11:41:08 +0100 Subject: [PATCH 041/201] BaryonUtils.h updated debug output --- Grid/qcd/utils/BaryonUtils.h | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 1259225a..15516b56 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -514,11 +514,6 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); @@ -553,7 +548,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, t += usecond(); - std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; + std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } template @@ -570,11 +565,6 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for 
N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; GridBase *grid = q1_left.Grid(); @@ -607,7 +597,7 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, // t += usecond(); - // std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; + // std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } @@ -633,11 +623,6 @@ void BaryonUtils::ContractBaryonsSliced(const mobj &D1, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); @@ -663,11 +648,6 @@ void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - for (int t=0; t Date: Tue, 20 Oct 2020 10:11:43 +0200 Subject: [PATCH 042/201] Thread inversion of clover term --- .../WilsonCloverFermionImplementation.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git 
a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index e721c20d..3032a80c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -92,20 +92,16 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) int lvol = _Umu.Grid()->lSites(); int DimRep = Impl::Dimension; - Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - - Coordinate lcoor; - typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); - { autoView(CTv,CloverTerm,CpuRead); autoView(CTIv,CloverTermInv,CpuWrite); - for (int site = 0; site < lvol; site++) { + thread_for(site, lvol, { + Coordinate lcoor; grid->LocalIndexToLocalCoor(site, lcoor); - EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); peekLocalSite(Qx, CTv, lcoor); - Qxinv = Zero(); //if (csw!=0){ for (int j = 0; j < Ns; j++) for (int k = 0; k < Ns; k++) @@ -126,7 +122,7 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; // } pokeLocalSite(Qxinv, CTIv, lcoor); - } + }); } // Separate the even and odd parts From f313565a3cdc9e9487921f7c4d33c14c1292082b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 31 Oct 2020 12:12:40 +0000 Subject: [PATCH 043/201] HiP compile --- Grid/serialisation/JSON_IO.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/serialisation/JSON_IO.cc b/Grid/serialisation/JSON_IO.cc index 
aca8bab3..f2282099 100644 --- a/Grid/serialisation/JSON_IO.cc +++ b/Grid/serialisation/JSON_IO.cc @@ -26,7 +26,7 @@ *************************************************************************************/ /* END LEGAL */ #include -#ifndef __NVCC__ +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) NAMESPACE_BEGIN(Grid); From d10422ded88d34a6e72c16c31cfc74dd4805ca5b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 31 Oct 2020 18:12:30 -0400 Subject: [PATCH 044/201] Test project on group --- tests/core/Test_unary.cc | 106 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 tests/core/Test_unary.cc diff --git a/tests/core/Test_unary.cc b/tests/core/Test_unary.cc new file mode 100644 index 00000000..2ad6ba7b --- /dev/null +++ b/tests/core/Test_unary.cc @@ -0,0 +1,106 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU::HotConfiguration(pRNG,Umu); + + U = PeekIndex(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "< Date: Sat, 31 Oct 2020 18:12:47 -0400 Subject: [PATCH 045/201] Project on group fix on GPU tracked to reciprocal sqrt collision between CUDA and Grid rsqrt --- Grid/lattice/Lattice_ET.h | 2 -- Grid/simd/Grid_vector_unops.h | 8 -------- Grid/tensors/Tensor_Ta.h | 7 ++++++- Grid/tensors/Tensor_unary.h | 1 - 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index f828ef30..4a8a7423 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -350,7 +350,6 @@ GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); 
GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnarySqrt, sqrt(a)); -GridUnopClass(UnaryRsqrt, rsqrt(a)); GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryAsin, asin(a)); @@ -463,7 +462,6 @@ GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the // abs-fabs-dabs-labs thing GRID_DEF_UNOP(sqrt, UnarySqrt); -GRID_DEF_UNOP(rsqrt, UnaryRsqrt); GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(asin, UnaryAsin); diff --git a/Grid/simd/Grid_vector_unops.h b/Grid/simd/Grid_vector_unops.h index d225699b..b89bb785 100644 --- a/Grid/simd/Grid_vector_unops.h +++ b/Grid/simd/Grid_vector_unops.h @@ -125,14 +125,6 @@ accelerator_inline Grid_simd sqrt(const Grid_simd &r) { return SimdApply(SqrtRealFunctor(), r); } template -accelerator_inline Grid_simd rsqrt(const Grid_simd &r) { - return SimdApply(RSqrtRealFunctor(), r); -} -template -accelerator_inline Scalar rsqrt(const Scalar &r) { - return (RSqrtRealFunctor(), r); -} -template accelerator_inline Grid_simd cos(const Grid_simd &r) { return SimdApply(CosRealFunctor(), r); } diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index 1ef9fc23..f7af85b7 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -92,17 +92,22 @@ accelerator_inline iMatrix ProjectOnGroup(const iMatrix &arg) { // need a check for the group type? 
iMatrix ret(arg); + vtype rnrm; vtype nrm; vtype inner; for(int c1=0;c1 Date: Sat, 31 Oct 2020 18:14:31 -0400 Subject: [PATCH 046/201] Hip Free managed --- Grid/threads/Accelerator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index da8a85b0..d1a96266 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -328,7 +328,7 @@ inline void *acceleratorAllocDevice(size_t bytes) return ptr; }; -inline void acceleratorFreeShared(void *ptr){ free(ptr);}; +inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} From 5eeabaa2bb3f65f911817a4783fc43a45180baa5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 1 Nov 2020 01:16:01 +0000 Subject: [PATCH 047/201] HIP fix --- Grid/lattice/Lattice_basis.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index af9d7280..95f55d10 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) basis_v.push_back(basis[k].View(AcceleratorWrite)); } -#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) +#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) ) int max_threads = thread_max(); Vector < vobj > Bt(Nm * max_threads); thread_region @@ -161,11 +161,12 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in double * Qt_j = & Qt_jv[0]; for(int k=0;koSites(),vobj::Nsimd(),{ auto B=coalescedRead(zz); for(int k=k0; k Date: Tue, 3 Nov 2020 12:41:35 +0000 Subject: [PATCH 048/201] added Xi-to-Sigma 
rare decays --- Grid/qcd/utils/BaryonUtils.h | 169 +++++++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 15516b56..e6b52a43 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -1292,4 +1292,173 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, } );//end loop over lattice sites } + +/*********************************************************************** + * The following code is for Xi -> Sigma rare hypeon decays * + **********************************************************************/ + +/* Dq_loop is a quark line from t_H to t_H + * Dd_spec is a quark line from t_i to t_f + * Ds_spec is a quark line from t_i to t_f + * Dd_tf is a quark line from t_f to t_H + * Ds_ti is a quark line from t_i to t_H */ +template +template +void BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_xi, + const Gamma GammaB_sigma, + robj &result) +{ + + Gamma g5(Gamma::Algebra::Gamma5); + + auto DdG = Dd_spec * GammaB_sigma; + auto GDs = GammaB_xi * Ds_spec; + // Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) + auto DsGDd = Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; + // DsGDd * GammaB + auto DsGDdG = DsGDd * GammaB_sigma; + // GammaB * DsGDd + auto GDsGDd = GammaB_xi * DsGDd; + // GammaB * DsGDd * GammaB + auto GDsGDdG = GDsGDd * GammaB_sigma; + // \gamma_\mu^L * Dq_loop + auto trGDq = trace(GammaH * Dq_loop); + + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = epsilon[ie_s][0]; //a + int b_s = epsilon[ie_s][1]; //b + int c_s = epsilon[ie_s][2]; //c + for (int ie_x=0; ie_x < 6 ; ie_x++){ + int a_x = epsilon[ie_x][0]; //a' + int b_x = epsilon[ie_x][1]; //b' + int c_x = epsilon[ie_x][2]; //c' + auto ee_GD = epsilon_sgn[ie_s] * epsilon_sgn[ie_x] * trGDq; + for (int alpha_x=0; alpha_x +template +void 
BaryonUtils::XiToSigmaQ2EyeSite(const mobj &Dq_loop, + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_xi, + const Gamma GammaB_sigma, + robj &result) +{ + + Gamma g5(Gamma::Algebra::Gamma5); + + auto DdG = Dd_spec * GammaB_sigma; + auto GDs = GammaB_xi * Ds_spec; + // Ds * \gamma_\mu^L * Dq_loop * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) + auto DsGDqGDd = Ds_ti * Gamma_H * Dq_loop * Gamma_H * g5 * adj(Dd_tf) * g5; + // DsGDd * GammaB + auto DsGDqGDdG = DsGDqGDd * GammaB_sigma; + // GammaB * DsGDd + auto GDsGDqGDd = GammaB_xi * DsGDqGDd; + // GammaB * DsGDd * GammaB + auto GDsGDqGDdG = GDsGDqGDd * GammaB_sigma; + + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = epsilon[ie_s][0]; //a + int b_s = epsilon[ie_s][1]; //b + int c_s = epsilon[ie_s][2]; //c + for (int ie_x=0; ie_x < 6 ; ie_x++){ + int a_x = epsilon[ie_x][0]; //a' + int b_x = epsilon[ie_x][1]; //b' + int c_x = epsilon[ie_x][2]; //c' + auto ee = epsilon_sgn[ie_s] * epsilon_sgn[ie_x]; + for (int alpha_x=0; alpha_x +template +void BaryonUtils::XiToSigmaEye(const PropagatorField &qq_loop, + const mobj &Dd_spec, + const mobj &Ds_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_xi, + const Gamma GammaB_sigma, + const std::string op, + SpinMatrixField &stn_corr) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + GridBase *grid = qs_ti.Grid(); + + autoView( vcorr, stn_corr, CpuWrite); + autoView( vq_loop , qq_loop, CpuRead); + autoView( vd_tf , qd_tf, CpuRead); + autoView( vs_ti , qs_ti, CpuRead); + + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop[ss]; + auto Dd_tf = vd_tf[ss]; + auto Ds_ti = vs_ti[ss]; + sobj result=Zero(); + if(op == "Q1"){ + XiToSigmaQ1EyeSite(Dq_loop,Dd_spec,Ds_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); 
+ } else if(op == "Q2"){ + XiToSigmaQ2EyeSite(Dq_loop,Dd_spec,Ds_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); + } else { + assert(0 && "Weak Operator not correctly specified"); + } + vcorr[ss] = result; + } );//end loop over lattice sites +} + + NAMESPACE_END(Grid); From a3de7026c8d091b4d42eba5e5cb8e57419e0963e Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 3 Nov 2020 12:51:50 +0000 Subject: [PATCH 049/201] bugfix --- Grid/qcd/utils/BaryonUtils.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index e6b52a43..8e6bf722 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -207,6 +207,26 @@ public: const Gamma GammaB_sigma, const Gamma GammaB_nucl, robj &result); + template + static void XiToSigmaQ1EyeSite(const mobj &Dq_loop, + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); + template + static void XiToSigmaQ2EyeSite(const mobj &Dq_loop, + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); public: template static void SigmaToNucleonEye(const PropagatorField &qq_loop, @@ -229,6 +249,17 @@ public: const Gamma GammaB_nucl, const std::string op, SpinMatrixField &stn_corr); + template + static void XiToSigmaEye(const PropagatorField &qq_loop, + const mobj &Dd_spec, + const mobj &Ds_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr); }; template From 67023c334b16938c36af9f41647ba42891008e03 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 3 Nov 2020 13:07:37 +0000 Subject: [PATCH 050/201] bugfix --- 
Grid/qcd/utils/BaryonUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 8e6bf722..a281a669 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -1359,7 +1359,7 @@ void BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, // GammaB * DsGDd * GammaB auto GDsGDdG = GDsGDd * GammaB_sigma; // \gamma_\mu^L * Dq_loop - auto trGDq = trace(GammaH * Dq_loop); + auto trGDq = trace(Gamma_H * Dq_loop); for (int ie_s=0; ie_s < 6 ; ie_s++){ int a_s = epsilon[ie_s][0]; //a From 4014dfd5b9a79223023ab2bee3789a7efde45945 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 3 Nov 2020 16:13:08 +0000 Subject: [PATCH 051/201] first tested version --- Grid/qcd/utils/BaryonUtils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index a281a669..f5bc1480 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -1359,7 +1359,7 @@ void BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, // GammaB * DsGDd * GammaB auto GDsGDdG = GDsGDd * GammaB_sigma; // \gamma_\mu^L * Dq_loop - auto trGDq = trace(Gamma_H * Dq_loop); + auto trGDq = TensorRemove(trace(Gamma_H * Dq_loop)); for (int ie_s=0; ie_s < 6 ; ie_s++){ int a_s = epsilon[ie_s][0]; //a @@ -1369,7 +1369,7 @@ void BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, int a_x = epsilon[ie_x][0]; //a' int b_x = epsilon[ie_x][1]; //b' int c_x = epsilon[ie_x][2]; //c' - auto ee_GD = epsilon_sgn[ie_s] * epsilon_sgn[ie_x] * trGDq; + auto ee_GD = epsilon_sgn[ie_s] * epsilon_sgn[ie_x] * trGDq; for (int alpha_x=0; alpha_x Date: Tue, 3 Nov 2020 20:03:09 +0000 Subject: [PATCH 052/201] speedup in Sigma-to-nucleon --- Grid/qcd/utils/BaryonUtils.h | 193 ++++++++++++++++++++++++++++++++++- 1 file changed, 189 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index f5bc1480..c0999a4a 
100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -1027,7 +1027,7 @@ void BaryonUtils::BaryonGamma3pt( * Du_spec is a quark line from t_i to t_f * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ -template +/*template template void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, @@ -1071,6 +1071,50 @@ void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, }} } } +}*/ +template +template +void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) +{ + + Gamma g5(Gamma::Algebra::Gamma5); + + auto DuG = Du_spec * GammaB_nucl; + // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) + auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; + // Dq_loop * \gamma_\mu^L + auto trDqG = TensorRemove(trace(Dq_loop * Gamma_H)); + + for (int ie_n=0; ie_n < 6 ; ie_n++){ + int a_n = epsilon[ie_n][0]; //a + int b_n = epsilon[ie_n][1]; //b + int c_n = epsilon[ie_n][2]; //c + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = epsilon[ie_s][0]; //a' + int b_s = epsilon[ie_s][1]; //b' + int c_s = epsilon[ie_s][2]; //c' + for (int alpha_s=0; alpha_s::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, * Du_spec is a quark line from t_i to t_f * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ -template +/*template template void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, @@ -1130,6 +1174,55 @@ void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, }} } } +}*/ +template +template +void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) +{ + + Gamma 
g5(Gamma::Algebra::Gamma5); + + auto DuG = Du_spec * GammaB_nucl; + // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) + auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; + // Du_ti * \gamma_\mu^L * adj(Du_tf) + auto DuGHDu = Du_ti * Gamma_H * g5 * adj(Du_tf) * g5; + auto DuGHDuG = DuGHDu * GammaB_nucl; + + for (int ie_n=0; ie_n < 6 ; ie_n++){ + int a_n = epsilon[ie_n][0]; //a + int b_n = epsilon[ie_n][1]; //b + int c_n = epsilon[ie_n][2]; //c + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = epsilon[ie_s][0]; //a' + int b_s = epsilon[ie_s][1]; //b' + int c_s = epsilon[ie_s][2]; //c' + for (int alpha_s=0; alpha_s::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, * Du_spec is a quark line from t_i to t_f * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ -template +/*template template void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, @@ -1181,6 +1274,47 @@ void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, }}} } } +}*/ +template +template +void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) +{ + + Gamma g5(Gamma::Algebra::Gamma5); + + auto DuG = Du_spec * GammaB_nucl; + // Gamma^B * Ds * \gamma_\mu^L * Dq_loop * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) + auto GDsGDqGDd = GammaB_sigma * Ds_ti * Gamma_H * Dq_loop * Gamma_H * g5 * adj(Dd_tf) * g5; + + for (int ie_n=0; ie_n < 6 ; ie_n++){ + int a_n = epsilon[ie_n][0]; //a + int b_n = epsilon[ie_n][1]; //b + int c_n = epsilon[ie_n][2]; //c + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = epsilon[ie_s][0]; //a' + int b_s = epsilon[ie_s][1]; //b' + int c_s = epsilon[ie_s][2]; //c' + for (int alpha_s=0; alpha_s::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, * Du_spec is a quark line from t_i to t_f * Dd_tf is a quark line from t_f to 
t_H * Ds_ti is a quark line from t_i to t_H */ -template +/*template template void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, @@ -1240,6 +1374,57 @@ void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, }}} } } +}*/ +template +template +void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) +{ + + Gamma g5(Gamma::Algebra::Gamma5); + + auto DuG = Du_spec * GammaB_nucl; + // Gamma^B * Ds * \gamma_\mu^L * adj(Du) + auto GDsGDu = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Du_tf) * g5; + // GDsGDu * GammaB + auto GDsGDuG = GDsGDu * GammaB_nucl; + // Du * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) + auto DuGDd = Du_ti * Gamma_H * g5 * adj(Dd_tf) * g5; + + for (int ie_n=0; ie_n < 6 ; ie_n++){ + int a_n = epsilon[ie_n][0]; //a + int b_n = epsilon[ie_n][1]; //b + int c_n = epsilon[ie_n][2]; //c + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = epsilon[ie_s][0]; //a' + int b_s = epsilon[ie_s][1]; //b' + int c_s = epsilon[ie_s][2]; //c' + auto ee = epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + for (int alpha_s=0; alpha_s From 3594ce877beb22161dcce972f14bd89598972fd6 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 3 Nov 2020 20:04:30 +0000 Subject: [PATCH 053/201] speedup in Sigma-to-nucleon --- Grid/qcd/utils/BaryonUtils.h | 196 ----------------------------------- 1 file changed, 196 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index c0999a4a..8a4ff6ac 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -1027,51 +1027,6 @@ void BaryonUtils::BaryonGamma3pt( * Du_spec is a quark line from t_i to t_f * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ -/*template -template -void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, - 
const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) -{ - - Gamma g5(Gamma::Algebra::Gamma5); - - auto DuG = Du_spec * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; - // Dq_loop * \gamma_\mu^L - auto DqG = Dq_loop * Gamma_H; - - for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s template void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, @@ -1122,59 +1077,6 @@ void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, * Du_spec is a quark line from t_i to t_f * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ -/*template -template -void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) -{ - - Gamma g5(Gamma::Algebra::Gamma5); - - auto DuG = Du_spec * GammaB_nucl; - auto adjDu = g5 * adj(Du_tf) * g5; - auto adjDuG = adjDu * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; - // Dq_loop * \gamma_\mu^L - auto DuGH = Du_ti * Gamma_H; - - for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s template void 
BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, @@ -1230,51 +1132,6 @@ void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, * Du_spec is a quark line from t_i to t_f * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ -/*template -template -void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) -{ - - Gamma g5(Gamma::Algebra::Gamma5); - - auto DuG = Du_spec * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L - auto GDsG = GammaB_sigma * Ds_ti * Gamma_H; - // Dq_loop * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto DqGDd = Dq_loop * Gamma_H * g5 * adj(Dd_tf) * g5; - - for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s template void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, @@ -1322,59 +1179,6 @@ void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, * Du_spec is a quark line from t_i to t_f * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ -/*template -template -void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) -{ - - Gamma g5(Gamma::Algebra::Gamma5); - - auto DuG = Du_spec * GammaB_nucl; - auto adjDu = g5 * adj(Du_tf) * g5; - auto adjDuG = adjDu * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L - auto GDsG = GammaB_sigma * Ds_ti * Gamma_H; - // Du * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto DuGDd = Du_ti * Gamma_H * g5 * 
adj(Dd_tf) * g5; - - for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s template void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, From 41e28015ae3c86cea5dc41456cff875954837c4e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sat, 7 Nov 2020 13:32:16 +0100 Subject: [PATCH 054/201] Volume divisible guarantee --- benchmarks/Benchmark_ITT.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index eb275728..538366d7 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -334,8 +334,9 @@ public: int threads = GridThread::GetThreads(); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); - GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); @@ -343,7 +344,6 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout< Date: Thu, 12 Nov 2020 20:29:58 +0100 Subject: [PATCH 055/201] Host memory explict --- benchmarks/Benchmark_comms.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 232030c8..ccffb564 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -94,8 +94,8 @@ int main (int argc, char ** argv) RealD Nnode = Grid.NodeCount(); RealD ppn = Nrank/Nnode; - 
std::vector > xbuf(8); - std::vector > rbuf(8); + std::vector > xbuf(8); + std::vector > rbuf(8); for(int mu=0;mu<8;mu++){ xbuf[mu].resize(lat*lat*lat*Ls); From 50b808ab33af9d12d47189f723802c6c31a9aa69 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 12 Nov 2020 22:28:12 +0100 Subject: [PATCH 056/201] Configure option between host and device --- Grid/cshift/Cshift_common.h | 88 +++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index b0dd068d..f2f39815 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -35,7 +35,7 @@ extern Vector > Cshift_table; // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask, int off=0) +Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen } } { - autoView(rhs_v , rhs, AcceleratorRead); auto buffer_p = & buffer[0]; auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT + autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for(i,ent,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for2d(n,e1,b,e2,1,{ int o = n*n1; @@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice &rhs, vobj temp =rhs_v[so+o+b]; extract(temp,pointers,offset); }); +#else + autoView(rhs_v , 
rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + int o = n*n1; + int offset = b+n*e2; + + vobj temp =rhs_v[so+o+b]; + extract(temp,pointers,offset); + }); +#endif } else { - autoView(rhs_v , rhs, AcceleratorRead); - Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < &rhs, extract(temp,pointers,offset); } }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + + Coordinate coor; + + int o=n*n1; + int oindex = o+b; + + int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); + + int ocb=1<(temp,pointers,offset); + } + }); +#endif } } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,commVector &buffer, int dimension,int plane,int cbmask) +template void Scatter_plane_simple (Lattice &rhs,cshiftVector &buffer, int dimension,int plane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -182,12 +220,19 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - autoView( rhs_v , rhs, AcceleratorWrite); int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; +#ifdef ACCELERATOR_CSHIFT + autoView( rhs_v , rhs, AcceleratorWrite); accelerator_for2d(n,e1,b,e2,1,{ int o = n*_slice_stride; int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); +#else + autoView( rhs_v , rhs, CpuWrite); + thread_for2d(n,e1,b,e2,{ + int o = n*_slice_stride; + int offset = b+n*_slice_block; + merge(rhs_v[so+o+b],pointers,offset); + }); +#endif } else { // Case of SIMD split AND checker dim cannot currently be hit, except in @@ -280,12 +334,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs } { + auto table = 
&Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); - auto table = &Cshift_table[0]; accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + autoView(lhs_v , lhs, CpuWrite); + thread_for(i,ent,{ + lhs_v[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -324,12 +386,20 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice Date: Thu, 12 Nov 2020 22:54:27 +0100 Subject: [PATCH 057/201] Option for bounce through the SHM buffer --- Grid/cshift/Cshift_mpi.h | 223 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 213 insertions(+), 10 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h b/Grid/cshift/Cshift_mpi.h index 0f0e80b1..375d004e 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -101,7 +101,8 @@ template void Cshift_comms_simd(Lattice& ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { typedef typename vobj::vector_type vector_type; @@ -121,9 +122,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - commVector send_buf(buffer_size); - commVector recv_buf(buffer_size); - + cshiftVector send_buf(buffer_size); + cshiftVector recv_buf(buffer_size); + int cb= (cbmask==0x2)? 
Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -138,7 +139,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r } else { - int words = send_buf.size(); + int words = buffer_size; if (cbmask != 0x3) words=words>>1; int bytes = words * sizeof(vobj); @@ -150,12 +151,14 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + grid->Barrier(); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); @@ -195,8 +198,15 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd,commVector(buffer_size) ); - std::vector > recv_buf_extract(Nsimd,commVector(buffer_size) ); + std::vector > send_buf_extract(Nsimd); + std::vector > recv_buf_extract(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + + for(int s=0;s void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], + grid->Barrier(); + + send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; + recv_buf_extract_mpi = &recv_buf_extract[i][0]; + grid->SendToRecvFrom((void *)send_buf_extract_mpi, xmit_to_rank, - (void *)&recv_buf_extract[i][0], + (void *)recv_buf_extract_mpi, recv_from_rank, bytes); + + grid->Barrier(); + + rpointers[i] = &recv_buf_extract[i][0]; + } else { + rpointers[i] = &send_buf_extract[nbr_lane][0]; + } + + } + Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); + } + +} +#else +template void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + typedef typename vobj::vector_type vector_type; + typedef 
typename vobj::scalar_type scalar_type; + + GridBase *grid=rhs.Grid(); + Lattice temp(rhs.Grid()); + + int fd = rhs.Grid()->_fdimensions[dimension]; + int rd = rhs.Grid()->_rdimensions[dimension]; + int pd = rhs.Grid()->_processors[dimension]; + int simd_layout = rhs.Grid()->_simd_layout[dimension]; + int comm_dim = rhs.Grid()->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; + cshiftVector send_buf_v(buffer_size); + cshiftVector recv_buf_v(buffer_size); + vobj *send_buf; + vobj *recv_buf; + { + grid->ShmBufferFreeAll(); + size_t bytes = buffer_size*sizeof(vobj); + send_buf=(vobj *)grid->ShmBufferMalloc(bytes); + recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); + } + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + + // int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); + + grid->Barrier(); + + Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + } + } +} + +template void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + GridBase *grid=rhs.Grid(); + const int Nsimd = grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object scalar_object; + typedef typename vobj::scalar_type scalar_type; + + int fd = grid->_fdimensions[dimension]; + int rd = 
grid->_rdimensions[dimension]; + int ld = grid->_ldimensions[dimension]; + int pd = grid->_processors[dimension]; + int simd_layout = grid->_simd_layout[dimension]; + int comm_dim = grid->_processors[dimension] >1 ; + + //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // int words = sizeof(vobj)/sizeof(vector_type); + + std::vector > send_buf_extract(Nsimd); + std::vector > recv_buf_extract(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + { + size_t bytes = sizeof(scalar_object)*buffer_size; + grid->ShmBufferFreeAll(); + send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + } + for(int s=0;s pointers(Nsimd); // + ExtractPointerArray rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + int cb = (cbmask==0x2)? Odd : Even; + int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer + int nbr_ox = (nbr_coor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + int recv_from_rank; + int xmit_to_rank; + + if (nbr_ic) nbr_lane|=inner_bit; + + assert (sx == nbr_ox); + + if(nbr_proc){ + grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); + grid->SendToRecvFrom((void *)send_buf_extract_mpi, + xmit_to_rank, + (void *)recv_buf_extract_mpi, + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); + grid->Barrier(); rpointers[i] = &recv_buf_extract[i][0]; } else { @@ -258,7 +461,7 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice Date: Fri, 13 Nov 2020 01:38:54 +0100 Subject: [PATCH 058/201] Option for host or device Cshift implementation --- Grid/allocator/AlignedAllocator.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 249732fb..4b357523 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -165,9 +165,17 @@ template inline bool operator!=(const devAllocator<_Tp>&, const d //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -//template using commAllocator = devAllocator; +#ifdef ACCELERATOR_CSHIFT +// Cshift on device +template using cshiftAllocator = devAllocator; +#else +// Cshift on host +template using cshiftAllocator = std::allocator; +#endif + template using Vector = std::vector >; template using commVector = std::vector >; +template using 
cshiftVector = std::vector >; NAMESPACE_END(Grid); From b13d1f72389a8a7b9cde67f1426bd2ae1b3a2e02 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:49:44 +0100 Subject: [PATCH 059/201] TOFU compat flag to help Isaaku --- Grid/GridStd.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/GridStd.h b/Grid/GridStd.h index ecb561ea..28f6bc46 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -28,4 +28,7 @@ /////////////////// #include "Config.h" +#ifdef TOFU +#undef GRID_COMMS_THREADS +#endif #endif /* GRID_STD_H */ From 6e313575bed12d65f40785d5bc8af05d6a6e5a6f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:50:16 +0100 Subject: [PATCH 060/201] Use of default GPU is behaviour, not a system property. Move Summit specific to configure.ac --- Grid/threads/Accelerator.cc | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 2134d158..bd13e04c 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -48,7 +48,7 @@ void acceleratorInit(void) prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { -#ifndef GRID_IBM_SUMMIT +#ifndef GRID_DEFAULT_GPU if ( i==rank ) { printf("AcceleratorCudaInit[%d]: ========================\n",rank); printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i); @@ -73,11 +73,17 @@ void acceleratorInit(void) #undef GPU_PROP_FMT #undef GPU_PROP -#ifdef GRID_IBM_SUMMIT +#ifdef GRID_DEFAULT_GPU // IBM Jsrun makes cuda Device numbering screwy and not match rank - if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n"); + if ( world_rank == 0 ) { + printf("AcceleratorCudaInit: using default device \n"); + printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n"); + printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding 
\n"); + printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n"); + } #else printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank); + printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n"); cudaSetDevice(rank); #endif if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); @@ -139,11 +145,18 @@ void acceleratorInit(void) MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours #undef GPU_PROP_FMT #undef GPU_PROP -#ifdef GRID_IBM_SUMMIT - // IBM Jsrun makes cuda Device numbering screwy and not match rank - if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n"); + +#ifdef GRID_DEFAULT_GPU + if ( world_rank == 0 ) { + printf("AcceleratorHipInit: using default device \n"); + printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n"); + printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n"); + } #else - if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n"); + if ( world_rank == 0 ) { + printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank); + printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n"); + } hipSetDevice(rank); #endif if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); From cf23eff60eb387d26d490fbbe8ed8ba0e32776cd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:51:08 +0100 Subject: [PATCH 061/201] Device to Device, Memset, cannot assume UVM == Communicable --- Grid/threads/Accelerator.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index da8a85b0..75d557fd 100644 --- 
a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -166,15 +166,18 @@ inline void *acceleratorAllocDevice(size_t bytes) inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} inline int acceleratorIsCommunicable(void *ptr) { - int uvm; - auto - cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); - assert(cuerr == cudaSuccess ); - if(uvm) return 0; - else return 1; + // int uvm=0; + // auto + // cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); + // assert(cuerr == cudaSuccess ); + // if(uvm) return 0; + // else return 1; + return 1; } #endif @@ -229,8 +232,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline 
void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) { #if 0 @@ -332,6 +337,8 @@ inline void acceleratorFreeShared(void *ptr){ free(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} #endif @@ -369,8 +376,10 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline int acceleratorIsCommunicable(void *ptr){ return 1; } +inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} #ifdef HAVE_MM_MALLOC_H inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; @@ -393,6 +402,8 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN, inline void acceleratorFreeCpu (void *ptr){free(ptr);}; #endif + + /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// 
From d05ce01809444087cceaf22c86c37d8977024c2a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:52:19 +0100 Subject: [PATCH 062/201] TOFU behaviour now optional THREAD_MULTIPLE or THREAD_SERIALIZED --- Grid/communicator/Communicator_mpi3.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 83f71233..c6543851 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { -#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori +#ifndef GRID_COMMS_THREADS nCommThreads=1; // wrong results here too // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs @@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector Date: Fri, 13 Nov 2020 03:57:58 +0100 Subject: [PATCH 063/201] Work on 2,2,2,8 ranks --- benchmarks/Benchmark_ITT.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 538366d7..8495bbc5 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,11 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; +#if 1 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; +#else + double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; +#endif double flops=(fps*volume)/2; double mf_hi, mf_lo, mf_err; @@ -498,8 +502,9 @@ public: int threads = GridThread::GetThreads(); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); - GridCartesian * 
TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); From e9bc7488280a824750279bdfafcd2891eeb65c37 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 13 Nov 2020 03:58:34 +0100 Subject: [PATCH 064/201] Useful GPU machine benchmark for GDR used to shakeout Booster at Juelich - see slack earlyaccess channel --- benchmarks/Benchmark_comms_host_device.cc | 260 ++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 benchmarks/Benchmark_comms_host_device.cc diff --git a/benchmarks/Benchmark_comms_host_device.cc b/benchmarks/Benchmark_comms_host_device.cc new file mode 100644 index 00000000..591b5597 --- /dev/null +++ b/benchmarks/Benchmark_comms_host_device.cc @@ -0,0 +1,260 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_comms.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void header(){ + std::cout <1) nmu++; + + std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< > xbuf(8); + std::vector > rbuf(8); + + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + } + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + 
std::cout< xbuf(8); + std::vector rbuf(8); + + uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + } + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout< Date: Fri, 13 Nov 2020 03:59:36 +0100 Subject: [PATCH 065/201] Must ask for COMMMS_THREADS --- Grid/util/Init.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 37d16176..9be39e94 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -473,11 +473,13 @@ void Grid_init(int *argc,char ***argv) LebesgueOrder::UseLebesgueOrder=1; } CartesianCommunicator::nCommThreads = 1; +#ifdef GRID_COMMS_THREADS if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); assert(CartesianCommunicator::nCommThreads > 0); } +#endif if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); From 18ef8056ecb846245ab8b6a7d3071dcbbea889af Mon Sep 17 00:00:00 2001 From: Peter 
Boyle Date: Fri, 13 Nov 2020 04:10:40 +0100 Subject: [PATCH 066/201] Hide Shared Memory --- Grid/communicator/SharedMemoryMPI.cc | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 0cbde9eb..5200b65c 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -457,8 +457,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; exit(EXIT_FAILURE); } - if ( WorldRank == 0 ){ - std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes + // if ( WorldRank == 0 ){ + if ( 1 ){ + std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes << "bytes at "<< std::hex<< ShmCommBuf < ranks(size); for(int r=0;r Date: Fri, 13 Nov 2020 04:11:03 +0100 Subject: [PATCH 067/201] Update options and simplify --- configure.ac | 88 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/configure.ac b/configure.ac index cee2a84c..9a020a7a 100644 --- a/configure.ac +++ b/configure.ac @@ -153,18 +153,28 @@ case ${ac_SFW_FP16} in AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);; esac -############### SUMMIT JSRUN -AC_ARG_ENABLE([summit], - [AC_HELP_STRING([--enable-summit=yes|no], [enable IBMs jsrun resource manager for SUMMIT])], - [ac_SUMMIT=${enable_summit}], [ac_SUMMIT=no]) -case ${ac_SUMMIT} in - no);; +############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons +AC_ARG_ENABLE([accelerator-cshift], + [AC_HELP_STRING([--enable-accelerator-cshift=yes|no], [run cshift on the device])], + [ac_ACC_CSHIFT=${enable_accelerator_cshift}], [ac_ACC_CSHIFT=yes]) + +AC_ARG_ENABLE([ucx-buggy], + [AC_HELP_STRING([--enable-ucx-buggy=yes|no], [enable workaround for UCX 
device buffer bugs])], + [ac_UCXBUGGY=${enable_ucx_buggy}], [ac_UCXBUGGY=no]) + +case ${ac_UCXBUGGY} in yes) - AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);; - *) - AC_DEFINE([GRID_IBM_SUMMIT],[1],[Let JSRUN manage the GPU device allocation]);; + ac_ACC_CSHIFT=no;; + *);; esac +case ${ac_ACC_CSHIFT} in + yes) + AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ UCX device buffer bugs are not present]);; + *);; +esac + + ############### SYCL/CUDA/HIP/none AC_ARG_ENABLE([accelerator], [AC_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none], [enable none,cuda,sycl,hip acceleration])], @@ -181,8 +191,9 @@ case ${ac_ACCELERATOR} in echo HIP acceleration AC_DEFINE([GRID_HIP],[1],[Use HIP offload]);; none) - echo NO acceleration - ;; + echo NO acceleration ;; + no) + echo NO acceleration ;; *) AC_MSG_ERROR(["Acceleration not suppoorted ${ac_ACCELERATOR}"]);; esac @@ -477,28 +488,26 @@ esac AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS" AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS" -############### Precision selection - deprecate -#AC_ARG_ENABLE([precision], -# [AC_HELP_STRING([--enable-precision=single|double], -# [Select default word size of Real])], -# [ac_PRECISION=${enable_precision}],[ac_PRECISION=double]) - +###### PRECISION ALWAYS DOUBLE AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) -#case ${ac_PRECISION} in -# single) -# AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] ) -# ;; -# double) -# ;; -# *) -# AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]); -# ;; -#esac +######################################################### +###################### set GPU device to rank in node ## +######################################################### +AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdevice], + [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no]) +case ${ac_SETDEVICE} in + yes);; + *) + 
AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] ) + ;; +esac -###################### Shared memory allocation technique under MPI3 -AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone], - [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen]) +######################################################### +###################### Shared memory intranode ######### +######################################################### +AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone|nvlink|no], + [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=no]) case ${ac_SHM} in @@ -517,8 +526,12 @@ case ${ac_SHM} in AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] ) ;; - shmnone) + shmnone | no) AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] ) + AC_DEFINE([GRID_SHM_DISABLE],[1],[USE MPI for intranode comms]);; + + nvlink) + AC_DEFINE([GRID_MPI3_SHM_NVLINK],[1],[GRID_MPI3_SHM_NVLINK] ) ;; hugetlbfs) @@ -537,10 +550,23 @@ AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path], [ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/]) AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing]) + +############### communication type selection +AC_ARG_ENABLE([comms-threads],[AC_HELP_STRING([--enable-comms-threads | --disable-comms-threads], + [Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes]) + +case ${ac_COMMS_THREADS} in + yes) + AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] ) + ;; + *) ;; +esac + ############### communication type selection AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto], [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none]) + case ${ac_COMMS} in none) AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) From 3aab983760b188bbaf4c0523d8f1faa39ed84919 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 16 Nov 2020 
17:13:58 +0100 Subject: [PATCH 068/201] Flop count set as in DiRAC-ITT-2020 (mistaken 20% low, but must maintain consistency) --- benchmarks/Benchmark_ITT.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 8ab26fc1..5d602ce9 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,7 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; -#if 1 +#if 0 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; #else double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; @@ -512,7 +512,6 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout< Date: Mon, 16 Nov 2020 18:07:15 -0500 Subject: [PATCH 069/201] Switch off SHM paths with --disable-shm --- Grid/communicator/SharedMemoryMPI.cc | 2 +- configure.ac | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 5200b65c..4b440fc0 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -772,7 +772,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) std::vector ranks(size); for(int r=0;r Date: Mon, 16 Nov 2020 20:15:50 -0500 Subject: [PATCH 070/201] --shm-force-mpi --- Grid/communicator/SharedMemoryMPI.cc | 7 ++++--- configure.ac | 9 +++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 4b440fc0..6089093b 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -666,7 +666,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) #endif void * ptr = mmap(NULL,size, PROT_READ | 
PROT_WRITE, mmap_flag, fd, 0); - // std::cout << "Set WorldShmCommBufs["< ranks(size); for(int r=0;r Date: Tue, 17 Nov 2020 04:41:15 -0800 Subject: [PATCH 071/201] Build without LIME --- benchmarks/Benchmark_IO.cc | 5 ++++- benchmarks/Benchmark_IO.hpp | 2 +- benchmarks/Benchmark_IO_vs_dir.cc | 5 ++++- scripts/filelist | 7 +++---- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index 0d80d425..87e7224d 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -1,4 +1,3 @@ - #include "Benchmark_IO.hpp" #ifndef BENCH_IO_LMIN @@ -13,6 +12,7 @@ #define BENCH_IO_NPASS 10 #endif +#ifdef HAVE_LIME using namespace Grid; std::string filestem(const int l) @@ -196,3 +196,6 @@ int main (int argc, char ** argv) return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp index c4a6ca58..2ff42d52 100644 --- a/benchmarks/Benchmark_IO.hpp +++ b/benchmarks/Benchmark_IO.hpp @@ -2,12 +2,12 @@ #define Benchmark_IO_hpp_ #include -#ifdef HAVE_LIME #define MSG std::cout << GridLogMessage #define SEP \ "-----------------------------------------------------------------------------" #define BIGSEP \ "=============================================================================" +#ifdef HAVE_LIME namespace Grid { diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc index e030bc39..8252547b 100644 --- a/benchmarks/Benchmark_IO_vs_dir.cc +++ b/benchmarks/Benchmark_IO_vs_dir.cc @@ -1,5 +1,5 @@ #include "Benchmark_IO.hpp" - +#ifdef HAVE_LIME using namespace Grid; int main (int argc, char ** argv) @@ -97,3 +97,6 @@ int main (int argc, char ** argv) return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/scripts/filelist b/scripts/filelist index 78747315..27425a3e 100755 --- a/scripts/filelist +++ b/scripts/filelist @@ -26,11 +26,10 @@ for subdir in $dirs; do echo 
"tests-local: ${TESTLIST} " > Make.inc echo ${PREF}_PROGRAMS = ${TESTLIST} >> Make.inc echo >> Make.inc - HADLINK=`[ $subdir = './hadrons' ] && echo '-lHadrons '` for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=${HADLINK}-lGrid >> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done if [ $subdir != '.' ]; then @@ -49,7 +48,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done cd .. @@ -65,7 +64,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc echo >> Make.inc done cd .. From 804a810d68df3dcd285798e4dd325224e1ef2470 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 18 Nov 2020 03:06:53 +0000 Subject: [PATCH 072/201] Wildcard mismatch --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index a387be8a..5b22309d 100644 --- a/configure.ac +++ b/configure.ac @@ -498,7 +498,7 @@ AC_ARG_ENABLE([setdevice],[AC_HELP_STRING([--enable-setdevice | --disable-setdev [Set GPU to rank in node with cudaSetDevice or similar])],[ac_SETDEVICE=${enable_SETDEVICE}],[ac_SETDEVICE=no]) case ${ac_SETDEVICE} in yes);; - *) + no) AC_DEFINE([GRID_DEFAULT_GPU],[1],[GRID_DEFAULT_GPU] ) ;; esac From 5adae5d6ff223fb52b6d69d87d0446f6c865a499 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 Nov 2020 19:22:12 +0100 Subject: [PATCH 073/201] Unused variable remove --- Grid/tensors/Tensor_Ta.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index f7af85b7..bbaa4a00 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -92,7 +92,6 @@ 
accelerator_inline iMatrix ProjectOnGroup(const iMatrix &arg) { // need a check for the group type? iMatrix ret(arg); - vtype rnrm; vtype nrm; vtype inner; for(int c1=0;c1 Date: Thu, 19 Nov 2020 19:23:03 +0100 Subject: [PATCH 074/201] Warning remove --- Grid/allocator/MemoryManagerShared.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 537f7c32..3f165007 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -1,7 +1,6 @@ #include #ifdef GRID_UVM -#warning "Grid is assuming unified virtual memory address space" NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////// // View management is 1:1 address space mapping From d5049949a45cee89cb6ff2de004fe719c36db8b6 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 Nov 2020 19:23:41 +0100 Subject: [PATCH 075/201] Starting to fix reunitarise --- Grid/qcd/utils/SUn.h | 84 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 7ac53246..69ab4ebb 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -735,7 +735,6 @@ public: } } - template static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) { typedef typename GaugeField::vector_type vector_type; @@ -800,6 +799,89 @@ public: } }; +template +LatticeComplexD Determinant(const Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + auto lvol = grid->lSites(); + LatticeComplexD ret(grid); + + autoView(Umu_v,Umu,CpuRead); + autoView(ret_v,ret,CpuWrite); + thread_for(site,lvol,{ + Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(site, lcoor); + iScalar > > Us; + peekLocalSite(Us, Umu_v, lcoor); + for(int i=0;i +static void ProjectSUn(Lattice > > > &Umu) +{ + Umu = ProjectOnGroup(Umu); + auto det = Determinant(Umu); + + det = 
pow(det,-1); + + for(int i=0;i(Umu,N-1,i); + element = element * det; + PokeIndex(Umu,element,Nc-1,i); + } +} +template +static void ProjectSUn(Lattice >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSUn(Umu); + PokeIndex(U,Umu,mu); + } +} +// Explicit specialisation for SU(3). +// Explicit specialisation for SU(3). +static void +ProjectSU3 (Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + const int x=0; + const int y=1; + const int z=2; + // Reunitarise + Umu = ProjectOnGroup(Umu); + autoView(Umu_v,Umu,CpuWrite); + thread_for(ss,grid->oSites(),{ + auto cm = Umu_v[ss]; + cm()()(2,x) = adj(cm()()(0,y)*cm()()(1,z)-cm()()(0,z)*cm()()(1,y)); //x= yz-zy + cm()()(2,y) = adj(cm()()(0,z)*cm()()(1,x)-cm()()(0,x)*cm()()(1,z)); //y= zx-xz + cm()()(2,z) = adj(cm()()(0,x)*cm()()(1,y)-cm()()(0,y)*cm()()(1,x)); //z= xy-yx + Umu_v[ss]=cm; + }); +} +static void ProjectSU3(Lattice >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSU3(Umu); + PokeIndex(U,Umu,mu); + } +} + typedef SU<2> SU2; typedef SU<3> SU3; typedef SU<4> SU4; From aace3d47b993a6269d6b53ae3705aa9cf299bd16 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 19 Nov 2020 19:24:14 +0100 Subject: [PATCH 076/201] partial work in progress --- tests/core/Test_reunitarise.cc | 137 +++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 tests/core/Test_reunitarise.cc diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc new file mode 100644 index 00000000..3e78b961 --- /dev/null +++ b/tests/core/Test_reunitarise.cc @@ -0,0 +1,137 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free 
software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD UU(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU::HotConfiguration(pRNG,Umu); + + U = PeekIndex(Umu,0); + org=U; + + + tmp= 
U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "<(U,Nc-1,i); + element = element * phase; + PokeIndex(U,element,Nc-1,i); + } + UU=U; + + detU= Determinant(U) ; + std::cout << "Determinant after screw up " <(UU); + detUU= Determinant(UU); + std::cout << "Determinant ProjectSUn " < Date: Fri, 20 Nov 2020 16:48:28 +0100 Subject: [PATCH 077/201] Configurable ALLOC_ALIGN and ALLOC_CACHE --- Grid/allocator/MemoryManager.h | 2 -- configure.ac | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index aac13aee..25c5b5f5 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? -#define ALLOCATION_CACHE -#define GRID_ALLOC_ALIGN (2*1024*1024) #define GRID_ALLOC_SMALL_LIMIT (4096) /*Pinning pages is costly*/ diff --git a/configure.ac b/configure.ac index 5b22309d..4d16d776 100644 --- a/configure.ac +++ b/configure.ac @@ -491,6 +491,28 @@ AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS" ###### PRECISION ALWAYS DOUBLE AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) +######################################################### +###################### GRID ALLOCATOR ALIGNMENT ## +######################################################### +AC_ARG_ENABLE([alloc-align],[AC_HELP_STRING([--enable-alloc-align=2MB|4k], + [Alignment in bytes of GRID Allocator ])],[ac_ALLOC_ALIGN=${enable_alloc_align}],[ac_ALLOC_ALIGN=2MB]) +case ${ac_ALLOC_ALIGN} in + 4k) + AC_DEFINE([GRID_ALLOC_ALIGN],[(4096)],[GRID_ALLOC_ALIGN]);; + 2MB) + AC_DEFINE([GRID_ALLOC_ALIGN],[(2*1024*1024)],[GRID_ALLOC_ALIGN]);; + *);; +esac + +AC_ARG_ENABLE([alloc-cache],[AC_HELP_STRING([--enable-alloc-cache ], + [Cache a pool of recent "frees" to reuse])],[ac_ALLOC_CACHE=${enable_alloc_cache}],[ac_ALLOC_CACHE=yes]) +case ${ac_ALLOC_CACHE} in + yes) + 
AC_DEFINE([ALLOCATION_CACHE],[1],[ALLOCATION_CACHE]);; + *);; +esac + + ######################################################### ###################### set GPU device to rank in node ## ######################################################### From 86e8b9fe387a922e13d4cfe67a2dbd5554f6ed46 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 20 Nov 2020 17:07:16 +0100 Subject: [PATCH 078/201] ALLOC_ALIGN removed --- Grid/threads/Accelerator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 5f5cd5fe..6232aea8 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -361,7 +361,7 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas ////////////////////////////////////////////// // CPU Target - No accelerator just thread instead ////////////////////////////////////////////// -#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned + #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) #undef GRID_SIMT From 147dc15d26da963d5c033701ad9e9211dcba50d3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 20 Nov 2020 13:13:59 -0500 Subject: [PATCH 079/201] Update --- benchmarks/Benchmark_ITT.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 5d602ce9..032535b3 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,7 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; -#if 0 +#if 1 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; #else double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; From d4861a362ccaca6bbf300da450f46c10ea40ed29 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:38:49 +0000 Subject: [PATCH 080/201] Stencil use non-UVM memory for look up table on 
enable-shared=no --- Grid/stencil/Stencil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1e198972..23fc8203 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -269,7 +269,7 @@ public: std::vector > > face_table ; Vector surface_list; - Vector _entries; // Resident in managed memory + stencilVector _entries; // Resident in managed memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; From 683a5e5bf556f97824fd242f76d0a81ab8dd7bd8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:39:51 +0000 Subject: [PATCH 081/201] Stencil use host vector for integera table on enable-shared=no and mirror it on device --- Grid/allocator/AlignedAllocator.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 4b357523..91622789 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -173,7 +173,8 @@ template using cshiftAllocator = devAllocator; template using cshiftAllocator = std::allocator; #endif -template using Vector = std::vector >; +template using Vector = std::vector >; +template using stencilVector = std::vector >; template using commVector = std::vector >; template using cshiftVector = std::vector >; From 97e264d0ff6562008be60ee4f3ba355198c7f247 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:46:11 +0000 Subject: [PATCH 082/201] Christoph's changes --- Grid/allocator/MemoryManagerCache.cc | 36 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 5dd7575e..50076eba 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -1,11 +1,12 @@ #include - #ifndef GRID_UVM #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); +//define 
dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout); #define dprintf(...) + //////////////////////////////////////////////////////////// // For caching copies of data on device //////////////////////////////////////////////////////////// @@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); if(AccCache.state==AccDirty) { @@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + dprintf("MemoryManager: 
Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - // dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) + EvictVictims(bytes); + assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { + dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + (uint64_t)AccCache.CpuPtr, + 
(uint64_t)CpuPtr, + (uint64_t)AccCache.bytes, + (uint64_t)bytes); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - // printf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - // printf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); } else { assert(0); } @@ -361,13 +368,14 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) + EvictVictims(bytes); + assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error From 321f0f51b59109f9cb2b17d0e0f6a1883076be54 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 24 Nov 2020 
21:46:10 -0500 Subject: [PATCH 083/201] Project to SU(N) --- Grid/qcd/action/gauge/GaugeImplTypes.h | 4 ++++ Grid/qcd/action/scalar/ScalarImpl.h | 8 ++++++++ Grid/qcd/hmc/HMC.h | 2 +- Grid/qcd/hmc/integrators/Integrator.h | 2 ++ Grid/qcd/utils/SUn.h | 5 ++--- tests/core/Test_reunitarise.cc | 19 +++++++++++++------ 6 files changed, 30 insertions(+), 10 deletions(-) diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index 9b7d5a60..55a20eca 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -154,6 +154,10 @@ public: return Hsum.real(); } + static inline void Project(Field &U) { + ProjectSUn(U); + } + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { SU::HotConfiguration(pRNG, U); } diff --git a/Grid/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h index 14675b11..403ea573 100644 --- a/Grid/qcd/action/scalar/ScalarImpl.h +++ b/Grid/qcd/action/scalar/ScalarImpl.h @@ -54,6 +54,10 @@ public: static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { U = 1.0; } + + static inline void Project(Field &U) { + return; + } static void MomentumSpacePropagator(Field &out, RealD m) { @@ -234,6 +238,10 @@ public: #endif //USE_FFT_ACCELERATION } + static inline void Project(Field &U) { + return; + } + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U); } diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h index 0f933204..f168b69a 100644 --- a/Grid/qcd/hmc/HMC.h +++ b/Grid/qcd/hmc/HMC.h @@ -95,7 +95,7 @@ private: typedef typename IntegratorType::Field Field; typedef std::vector< HmcObservable * > ObsListType; - + //pass these from the resource manager GridSerialRNG &sRNG; GridParallelRNG &pRNG; diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index d5475704..70055754 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ 
b/Grid/qcd/hmc/integrators/Integrator.h @@ -313,6 +313,8 @@ public: std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } + FieldImplementation::Project(U); + // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 69ab4ebb..675493b3 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -820,7 +820,6 @@ LatticeComplexD Determinant(const Lattice }} ComplexD det = EigenU.determinant(); pokeLocalSite(det,ret_v,lcoor); - std::cout << " site " < > > > &Umu) Umu = ProjectOnGroup(Umu); auto det = Determinant(Umu); - det = pow(det,-1); - + det = conjugate(det); + for(int i=0;i(Umu,N-1,i); element = element * det; diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc index 3e78b961..9a6781f1 100644 --- a/tests/core/Test_reunitarise.cc +++ b/tests/core/Test_reunitarise.cc @@ -102,7 +102,8 @@ int main (int argc, char ** argv) LatticeComplexD detUU(grid); detU= Determinant(U) ; - std::cout << "Determinant before screw up " <(UU); + ProjectSUn(UU); detUU= Determinant(UU); - std::cout << "Determinant ProjectSUn " < Date: Wed, 2 Dec 2020 17:55:30 -0800 Subject: [PATCH 084/201] Duplicate code --- Grid/allocator/AlignedAllocator.cc | 67 ------------------------------ 1 file changed, 67 deletions(-) delete mode 100644 Grid/allocator/AlignedAllocator.cc diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc deleted file mode 100644 index 0d1707d9..00000000 --- a/Grid/allocator/AlignedAllocator.cc +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include - -NAMESPACE_BEGIN(Grid); - -MemoryStats *MemoryProfiler::stats = nullptr; -bool MemoryProfiler::debug = false; - -void check_huge_pages(void *Buf,uint64_t BYTES) -{ -#ifdef __linux__ - int fd = open("/proc/self/pagemap", O_RDONLY); - assert(fd >= 0); - const int page_size = 4096; - uint64_t virt_pfn = (uint64_t)Buf / 
page_size; - off_t offset = sizeof(uint64_t) * virt_pfn; - uint64_t npages = (BYTES + page_size-1) / page_size; - uint64_t pagedata[npages]; - uint64_t ret = lseek(fd, offset, SEEK_SET); - assert(ret == offset); - ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); - assert(ret == sizeof(uint64_t) * npages); - int nhugepages = npages / 512; - int n4ktotal, nnothuge; - n4ktotal = 0; - nnothuge = 0; - for (int i = 0; i < nhugepages; ++i) { - uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; - for (int j = 0; j < 512; ++j) { - uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; - ++n4ktotal; - if (pageaddr != baseaddr + j * page_size) - ++nnothuge; - } - } - int rank = CartesianCommunicator::RankWorld(); - printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); -#endif -} - -std::string sizeString(const size_t bytes) -{ - constexpr unsigned int bufSize = 256; - const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; - char buf[256]; - size_t s = 0; - double count = bytes; - - while (count >= 1024 && s < 7) - { - s++; - count /= 1024; - } - if (count - floor(count) == 0.0) - { - snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); - } - else - { - snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); - } - - return std::string(buf); -} - -NAMESPACE_END(Grid); - From cf76741ec651c41f1d1fa38d22e0474899691e72 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Dec 2020 03:47:11 -0800 Subject: [PATCH 085/201] Intel DPCPP Gold happy now (compiles all, runs Benchmark_dwf_fp32 ) --- Grid/DisableWarnings.h | 2 ++ Grid/Makefile.am | 12 ++++++++ Grid/communicator/SharedMemory.h | 2 +- Grid/communicator/SharedMemoryMPI.cc | 2 +- Grid/communicator/SharedMemoryNone.cc | 43 ++++++++++++++++++++++++++- Grid/lattice/Lattice_basis.h | 5 ++-- benchmarks/Benchmark_gparity.cc | 4 +-- configure.ac | 18 +++++++++++ scripts/filelist | 18 +++++++++-- 9 files changed, 97 insertions(+), 9 
deletions(-) diff --git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h index 8ea219fb..4bd1edd0 100644 --- a/Grid/DisableWarnings.h +++ b/Grid/DisableWarnings.h @@ -37,7 +37,9 @@ directory #endif //disables and intel compiler specific warning (in json.hpp) +#ifdef __ICC #pragma warning disable 488 +#endif #ifdef __NVCC__ //disables nvcc specific warning in json.hpp diff --git a/Grid/Makefile.am b/Grid/Makefile.am index f1fa462e..ded6d146 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -21,6 +21,7 @@ if BUILD_HDF5 extra_headers+=serialisation/Hdf5Type.h endif + all: version-cache Version.h version-cache: @@ -53,6 +54,17 @@ Version.h: version-cache include Make.inc include Eigen.inc +extra_sources+=$(ZWILS_FERMION_FILES) +extra_sources+=$(WILS_FERMION_FILES) +extra_sources+=$(STAG_FERMION_FILES) +if BUILD_GPARITY + extra_sources+=$(GP_FERMION_FILES) +endif +if BUILD_FERMION_REPS + extra_sources+=$(ADJ_FERMION_FILES) + extra_sources+=$(TWOIND_FERMION_FILES) +endif + lib_LIBRARIES = libGrid.a CCFILES += $(extra_sources) diff --git a/Grid/communicator/SharedMemory.h b/Grid/communicator/SharedMemory.h index 6c6e3953..f2d20a24 100644 --- a/Grid/communicator/SharedMemory.h +++ b/Grid/communicator/SharedMemory.h @@ -102,7 +102,7 @@ public: /////////////////////////////////////////////////// static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryFree(void); - static void SharedMemoryCopy(void *dest,const void *src,size_t bytes); + static void SharedMemoryCopy(void *dest,void *src,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes); }; diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 6089093b..a12418e6 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -715,7 +715,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) bzero(dest,bytes); #endif } -void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void 
*src,size_t bytes) +void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) { #ifdef GRID_CUDA cudaMemcpy(dest,src,bytes,cudaMemcpyDefault); diff --git a/Grid/communicator/SharedMemoryNone.cc b/Grid/communicator/SharedMemoryNone.cc index ed37ab47..35663632 100644 --- a/Grid/communicator/SharedMemoryNone.cc +++ b/Grid/communicator/SharedMemoryNone.cc @@ -29,6 +29,7 @@ Author: Peter Boyle #include NAMESPACE_BEGIN(Grid); +#define header "SharedMemoryNone: " /*Construct from an MPI communicator*/ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) @@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended, use anonymous mmap //////////////////////////////////////////////////////////////////////////////////////////// +#if 1 +void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) +{ + std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "< Bt(Nm * max_threads); thread_region @@ -164,7 +164,8 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in auto basis_vp=& basis_v[0]; autoView(result_v,result,AcceleratorWrite); accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ - auto B=coalescedRead(zz); + vobj zzz=Zero(); + auto B=coalescedRead(zzz); for(int k=k0; k Make.inc echo >> Make.inc echo CCFILES=$CCFILES >> Make.inc - +echo ZWILS_FERMION_FILES=$ZWILS_FERMION_FILES >> Make.inc +echo WILS_FERMION_FILES=$WILS_FERMION_FILES >> Make.inc +echo STAG_FERMION_FILES=$STAG_FERMION_FILES >> Make.inc +echo GP_FERMION_FILES=$GP_FERMION_FILES >> Make.inc +echo ADJ_FERMION_FILES=$ADJ_FERMION_FILES >> Make.inc +echo TWOIND_FERMION_FILES=$TWOIND_FERMION_FILES >> Make.inc # tests Make.inc cd $home/tests From 2ef1fa66a8afe2066b8c1ef191a608bd64bdb3bd Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Mon, 7 Dec 2020 11:53:35 -0500 Subject: [PATCH 
086/201] Improved performance of G-parity kernel for GPUs by simplifying multLink implementation --- Grid/qcd/action/fermion/GparityWilsonImpl.h | 42 ++++++++------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index 0b726db9..9dca403b 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -97,42 +97,30 @@ public: Coordinate icoor; #ifdef GRID_SIMT - _Spinor tmp; - const int Nsimd =SiteDoubledGaugeField::Nsimd(); int s = acceleratorSIMTlane(Nsimd); St.iCoorFromIindex(icoor,s); int mmu = mu % Nd; - if ( SE->_around_the_world && St.parameters.twists[mmu] ) { - - int permute_lane = (sl==1) - || ((distance== 1)&&(icoor[direction]==1)) - || ((distance==-1)&&(icoor[direction]==0)); - if ( permute_lane ) { - tmp(0) = chi(1); - tmp(1) = chi(0); - } else { - tmp(0) = chi(0); - tmp(1) = chi(1); - } + auto UU0=coalescedRead(U(0)(mu)); + auto UU1=coalescedRead(U(1)(mu)); + + //Decide whether we do a G-parity flavor twist + //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir + //It also assumes (but does not check) that abs(distance) == 1 + int permute_lane = (sl==1) + || ((distance== 1)&&(icoor[direction]==1)) + || ((distance==-1)&&(icoor[direction]==0)); - auto UU0=coalescedRead(U(0)(mu)); - auto UU1=coalescedRead(U(1)(mu)); + permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world - mult(&phi(0),&UU0,&tmp(0)); - mult(&phi(1),&UU1,&tmp(1)); + //Apply the links + int f_upper = permute_lane ? 
1 : 0; + int f_lower = !f_upper; - } else { - - auto UU0=coalescedRead(U(0)(mu)); - auto UU1=coalescedRead(U(1)(mu)); - - mult(&phi(0),&UU0,&chi(0)); - mult(&phi(1),&UU1,&chi(1)); - - } + mult(&phi(0),&UU0,&chi(f_upper)); + mult(&phi(1),&UU1,&chi(f_lower)); #else typedef _Spinor vobj; From c438118fd719b5fd908dc72dcd6ff3d6db83923c Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Tue, 8 Dec 2020 14:42:11 +0100 Subject: [PATCH 087/201] Change access specifier of clover fields in order to allow deriving classes to access these --- Grid/qcd/action/fermion/WilsonCloverFermion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 91ad6d6d..92af7111 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -245,7 +245,7 @@ public: return out; } -private: +protected: // here fixing the 4 dimensions, make it more general? RealD csw_r; // Clover coefficient - spatial From 9aec4a3c2620dd459bac5b89fc0d661aaf50e6cd Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 10 Dec 2020 02:11:17 -0800 Subject: [PATCH 088/201] SYCL --- Grid/simd/Grid_gpu_vec.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 8b17f75a..8e55ce2f 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -38,12 +38,20 @@ Author: Peter Boyle #ifdef GRID_HIP #include #endif +#ifdef GRID_SYCL +namespace Grid { + typedef struct { uint16_t x;} half; + typedef struct { half x; half y;} half2; + typedef struct { float x; float y;} float2; + typedef struct { double x; double y;} double2; +} +#endif + namespace Grid { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) -typedef struct { uint16_t x;} half; -#endif + + typedef struct Half2_t { half x; half y; } Half2; #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) @@ -156,7 +164,7 @@ 
accelerator_inline float half2float(half h) f = __half2float(h); #else Grid_half hh; - hh.x = hr.x; + hh.x = h.x; f= sfw_half_to_float(hh); #endif return f; From 873519e96046acfd0844a7d07d540d989a7a6204 Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Mon, 14 Dec 2020 16:06:10 +0000 Subject: [PATCH 089/201] Enable existing conserved current code for CUDA (compiles OK for CUDA 10.1). Add option to Test_cayley_mres to load a configuration --- .../implementation/CayleyFermion5DImplementation.h | 4 ++-- tests/debug/Test_cayley_mres.cc | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index b3fbe096..f11e9c44 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -642,7 +642,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, Current curr_type, unsigned int mu) { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -826,7 +826,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 
1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 2e56fa81..5282c756 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -108,8 +108,18 @@ int main (int argc, char ** argv) GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU::ColdConfiguration(Umu); - // SU::HotConfiguration(RNG4,Umu); + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); + } RealD mass=0.3; RealD M5 =1.0; From 808f1e0e8c199204c7369fe2a033bc6041cbaa91 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 15 Dec 2020 16:33:29 +0000 Subject: [PATCH 090/201] merge develop --- tests/solver/Test_zMADWF_prec.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/solver/Test_zMADWF_prec.cc b/tests/solver/Test_zMADWF_prec.cc index d1168764..f18e1d86 100644 --- a/tests/solver/Test_zMADWF_prec.cc +++ b/tests/solver/Test_zMADWF_prec.cc @@ -52,7 +52,7 @@ struct TestParams{ bool zmobius_inner; double lambda_max; //upper bound of H_T eigenvalue range required to generate zMobius approximation - TestParams(): load_config(true), config_file("ckpoint_lat.1000"), mass(0.01), + TestParams(): load_config(false), config_file("ckpoint_lat.1000"), mass(0.01), Ls_outer(24), b_plus_c_outer(2.0), resid_outer(1e-8), Ls_inner(12), b_plus_c_inner(1.0), resid_inner(1e-8), zmobius_inner(true), lambda_max(1.42), outer_precon("Standard"), inner_precon("Standard") {} @@ -246,7 +246,7 @@ void run(const TestParams ¶ms){ typename RunParamsInner::SchurSolverType SchurSolver_inner(CG_inner); ZeroGuesser Guess; - MADWF > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 100, &update); + MADWF > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 10000, &update); LatticeFermionD result_MADWF(FGrid_outer); 
result_MADWF = Zero(); From f36d6f3923b7632e169b6740c94cc39ecc6bc8a9 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Thu, 17 Dec 2020 17:04:08 +0000 Subject: [PATCH 091/201] compiles on GPU. 3pt still wrong!!!! --- Grid/qcd/utils/A2Autils.h | 2 +- Grid/qcd/utils/BaryonUtils.h | 991 +++++++++++++++++++---------------- 2 files changed, 544 insertions(+), 449 deletions(-) diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index b63d8571..497927dd 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -1047,7 +1047,7 @@ A2Autils::ContractWWVV(std::vector &WWVV, { GridBase *grid = vs[0].Grid(); - int nd = grid->_ndimension; + //int nd = grid->_ndimension; int Nsimd = grid->Nsimd(); int N_t = WW_sd.dimensions()[0]; int N_s = WW_sd.dimensions()[1]; diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 15516b56..25c71e3a 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -44,24 +44,24 @@ public: typedef typename ComplexField::vector_object vobj; typedef Lattice> SpinMatrixField; - typedef typename SpinMatrixField::vector_object sobj; + //typedef typename SpinMatrixField::vector_object sobj; - static const int epsilon[6][3] ; - static const Real epsilon_sgn[6]; + //static const int epsilon[6][3] ; + //static const Real epsilon_sgn[6]; private: - template + template accelerator_inline static void BaryonSite(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const bool * wick_contractions, - robj &result); - template + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + const bool * wick_contractions, + robj &result); + template accelerator_inline static void BaryonSiteMatrix(const mobj &D1, const mobj &D2, const mobj &D3, @@ -76,15 +76,15 @@ 
public: std::string qf, bool* wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, - const PropagatorField &q2_left, - const PropagatorField &q3_left, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const bool* wick_contractions, - const int parity, - ComplexField &baryon_corr); + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int parity, + ComplexField &baryon_corr); static void ContractBaryonsMatrix(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, @@ -96,16 +96,16 @@ public: SpinMatrixField &baryon_corr); template static void ContractBaryonsSliced(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const bool* wick_contractions, - const int parity, - const int nt, - robj &result); + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int parity, + const int nt, + robj &result); template static void ContractBaryonsSlicedMatrix(const mobj &D1, const mobj &D2, @@ -118,11 +118,11 @@ public: const int nt, robj &result); private: - template + template accelerator_inline static void BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, - const mobj2 &Dq3_spec, + // const mobj2 &Dq3_spec, const mobj &Dq4_tf, const Gamma GammaJ, const Gamma GammaBi, @@ -130,11 +130,11 @@ public: int wick_contraction, robj &result); - template + template accelerator_inline static void BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, - const mobj2 &Dq3_spec, + //const mobj2 &Dq3_spec, const mobj &Dq4_tf, const 
Gamma GammaJ, const Gamma GammaBi, @@ -142,10 +142,10 @@ public: int wick_contraction, robj &result); - template + template accelerator_inline static void BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, - const mobj2 &Dq2_spec, + //const mobj2 &Dq2_spec, const mobj &Dq3_ti, const mobj &Dq4_tf, const Gamma GammaJ, @@ -167,86 +167,78 @@ public: const Gamma GammaBf, SpinMatrixField &stn_corr); private: - template + template accelerator_inline static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); - template + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); + template accelerator_inline static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); - template + template accelerator_inline static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); - template + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); + template accelerator_inline static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); + const 
mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); public: template static void SigmaToNucleonEye(const PropagatorField &qq_loop, - const mobj &Du_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &stn_corr); + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr); template static void SigmaToNucleonNonEye(const PropagatorField &qq_ti, - const PropagatorField &qq_tf, - const mobj &Du_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &stn_corr); + const PropagatorField &qq_tf, + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr); }; - +/* template const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -/*template -const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), - Complex(1), - Complex(1), - Complex(-1), - Complex(-1), - Complex(-1)}; -*/ template const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; - +*/ //This is the old version template -template +template accelerator_inline void BaryonUtils::BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, @@ -274,16 +266,20 @@ void BaryonUtils::BaryonSite(const mobj &D1, auto GBf_D3 = GammaB_f * D3; auto GAf_D3 = GammaA_f * D3; - for (int ie_f=0; ie_f < 6 ; ie_f++){ - int a_f = epsilon[ie_f][0]; //a - int b_f = epsilon[ie_f][1]; //b - int c_f = 
epsilon[ie_f][2]; //c - for (int ie_i=0; ie_i < 6 ; ie_i++){ - int a_i = epsilon[ie_i][0]; //a' - int b_i = epsilon[ie_i][1]; //b' - int c_i = epsilon[ie_i][2]; //c' + Real ee; - Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); + + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; //This is the \delta_{456}^{123} part if (wick_contraction[0]){ for (int rho=0; rho::BaryonSite(const mobj &D1, //New version without parity projection or trace template -template +template accelerator_inline void BaryonUtils::BaryonSiteMatrix(const mobj &D1, const mobj &D2, const mobj &D3, @@ -384,16 +380,21 @@ void BaryonUtils::BaryonSiteMatrix(const mobj &D1, auto GBf_D3 = GammaB_f * D3; auto GAf_D3 = GammaA_f * D3; - for (int ie_f=0; ie_f < 6 ; ie_f++){ - int a_f = epsilon[ie_f][0]; //a - int b_f = epsilon[ie_f][1]; //b - int c_f = epsilon[ie_f][2]; //c - for (int ie_i=0; ie_i < 6 ; ie_i++){ - int a_i = epsilon[ie_i][0]; //a' - int b_i = epsilon[ie_i][1]; //b' - int c_i = epsilon[ie_i][2]; //c' - Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 
1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); + + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; //This is the \delta_{456}^{123} part if (wick_contraction[0]){ for (int rho_i=0; rho_i::WickContractions(std::string qi, std::string qf, bool* * Wick_Contractions function above */ template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, - const PropagatorField &q2_left, - const PropagatorField &q3_left, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const bool* wick_contractions, - const int parity, - ComplexField &baryon_corr) + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int parity, + ComplexField &baryon_corr) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); @@ -519,10 +520,10 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, GridBase *grid = q1_left.Grid(); - autoView(vbaryon_corr, baryon_corr,CpuWrite); - autoView( v1 , q1_left, CpuRead); - autoView( v2 , q2_left, CpuRead); - autoView( v3 , q3_left, CpuRead); + autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( v1 , q1_left , AcceleratorRead); + autoView( v2 , q2_left , AcceleratorRead); + autoView( v3 , q3_left , AcceleratorRead); Real bytes =0.; bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); @@ -538,12 +539,13 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, t =-usecond(); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto D1 = v1[ss]; - auto D2 = v2[ss]; - 
auto D3 = v3[ss]; - vobj result=Zero(); + auto D1 = v1(ss); + auto D2 = v2(ss); + auto D3 = v3(ss); + typedef decltype(coalescedRead(vbaryon_corr[0])) cVec; + cVec result=Zero(); BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); - vbaryon_corr[ss] = result; + coalescedWrite(vbaryon_corr[ss],result); } );//end loop over lattice sites t += usecond(); @@ -567,38 +569,21 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); GridBase *grid = q1_left.Grid(); - - autoView(vbaryon_corr, baryon_corr,CpuWrite); - autoView( v1 , q1_left, CpuRead); - autoView( v2 , q2_left, CpuRead); - autoView( v3 , q3_left, CpuRead); - // Real bytes =0.; - // bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); - // for (int ie=0; ie < 6 ; ie++){ - // if(ie==0 or ie==3){ - // bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; - // } - // else{ - // bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; - // } - // } - // Real t=0.; - // t =-usecond(); + autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( v1 , q1_left , AcceleratorRead); + autoView( v2 , q2_left , AcceleratorRead); + autoView( v3 , q3_left , AcceleratorRead); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto D1 = v1[ss]; - auto D2 = v2[ss]; - auto D3 = v3[ss]; - sobj result=Zero(); + auto D1 = v1(ss); + auto D2 = v2(ss); + auto D3 = v3(ss); + typedef decltype(coalescedRead(vbaryon_corr[0])) spinor; + spinor result=Zero(); BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); - vbaryon_corr[ss] = result; + coalescedWrite(vbaryon_corr[ss],result); } );//end loop over lattice sites - - // t += usecond(); - - // std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << 
std::endl; - } /* The array wick_contractions must be of length 6. The order * @@ -609,16 +594,16 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, template template void BaryonUtils::ContractBaryonsSliced(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const bool* wick_contractions, - const int parity, - const int nt, - robj &result) + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int parity, + const int nt, + robj &result) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); @@ -664,11 +649,11 @@ void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, * Dq3_spec is a quark line from t_i to t_f * Dq4_tf is a quark line from t_f to t_J */ template -template +template accelerator_inline void BaryonUtils::BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, - const mobj2 &Dq3_spec, + // const mobj2 &Dq3_spec, const mobj &Dq4_tf, const Gamma GammaJ, const Gamma GammaBi, @@ -678,41 +663,47 @@ void BaryonUtils::BaryonGamma3ptGroup1Site( { Gamma g5(Gamma::Algebra::Gamma5); - auto adjD4_g_D1 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti; +// auto adjD4_g_D1 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti; + auto adjD4 = g5 * adj(Dq4_tf) * g5 ; + auto adjD4_g_D1 = adjD4 * GammaJ * Dq1_ti; auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1; auto D2_Gi = Dq2_spec * GammaBi; auto Gf_D2_Gi = GammaBf * D2_Gi; - auto Gf_D3 = GammaBf * Dq3_spec; - int a_f, b_f, c_f; - int a_i, b_i, c_i; +// auto Gf_D3 = GammaBf * Dq3_spec; // including a second mobj2 parameter leads to compilation error + auto Gf_D3 = GammaBf * Dq2_spec; //WRONG!!!!! 
- Real ee; - for (int ie_f=0; ie_f < 6 ; ie_f++){ - a_f = epsilon[ie_f][0]; //a - b_f = epsilon[ie_f][1]; //b - c_f = epsilon[ie_f][2]; //c - for (int ie_i=0; ie_i < 6 ; ie_i++){ - a_i = epsilon[ie_i][0]; //a' - b_i = epsilon[ie_i][1]; //b' - c_i = epsilon[ie_i][2]; //c' + Real ee; - ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); + + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; for (int alpha_f=0; alpha_f::BaryonGamma3ptGroup1Site( * Dq3_spec is a quark line from t_i to t_f * Dq4_tf is a quark line from t_f to t_J */ template -template +template accelerator_inline void BaryonUtils::BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, - const mobj2 &Dq3_spec, + // const mobj2 &Dq3_spec, const mobj &Dq4_tf, const Gamma GammaJ, const Gamma GammaBi, @@ -773,37 +764,40 @@ void BaryonUtils::BaryonGamma3ptGroup2Site( auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi; auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi; auto Gf_D1 = GammaBf * Dq1_spec; - auto Gf_D3 = GammaBf * Dq3_spec; + //auto Gf_D3 = GammaBf * Dq3_spec; + auto Gf_D3 = GammaBf * Dq1_spec; // WRONG!!!!! 
- int a_f, b_f, c_f; - int a_i, b_i, c_i; - Real ee; + Real ee; - for (int ie_f=0; ie_f < 6 ; ie_f++){ - a_f = epsilon[ie_f][0]; //a - b_f = epsilon[ie_f][1]; //b - c_f = epsilon[ie_f][2]; //c - for (int ie_i=0; ie_i < 6 ; ie_i++){ - a_i = epsilon[ie_i][0]; //a' - b_i = epsilon[ie_i][1]; //b' - c_i = epsilon[ie_i][2]; //c' + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; for (int alpha_f=0; alpha_f::BaryonGamma3ptGroup2Site( * Dq3_ti is a quark line from t_i to t_J * Dq4_tf is a quark line from t_f to t_J */ template -template +template accelerator_inline void BaryonUtils::BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, - const mobj2 &Dq2_spec, + // const mobj2 &Dq2_spec, const mobj &Dq3_ti, const mobj &Dq4_tf, const Gamma GammaJ, @@ -863,24 +857,25 @@ void BaryonUtils::BaryonGamma3ptGroup3Site( auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti; auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3; auto Gf_D1 = GammaBf * Dq1_spec; - auto D2_Gi = Dq2_spec * GammaBi; + //auto D2_Gi = Dq2_spec * GammaBi; + auto D2_Gi = Dq1_spec * GammaBi; //WRONG!!!!!!!!!!!!!!!!! 
auto Gf_D2_Gi = GammaBf * D2_Gi; - int a_f, b_f, c_f; - int a_i, b_i, c_i; - Real ee; + Real ee; - for (int ie_f=0; ie_f < 6 ; ie_f++){ - a_f = epsilon[ie_f][0]; //a - b_f = epsilon[ie_f][1]; //b - c_f = epsilon[ie_f][2]; //c - for (int ie_i=0; ie_i < 6 ; ie_i++){ - a_i = epsilon[ie_i][0]; //a' - b_i = epsilon[ie_i][1]; //b' - c_i = epsilon[ie_i][2]; //c' + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; for (int alpha_f=0; alpha_f::BaryonGamma3pt( const Gamma GammaBf, SpinMatrixField &stn_corr) { + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + GridBase *grid = q_tf.Grid(); - autoView( vcorr, stn_corr, CpuWrite); - autoView( vq_ti , q_ti, CpuRead); - autoView( vq_tf , q_tf, CpuRead); + // autoView( vcorr, stn_corr, CpuWrite); + // autoView( vq_ti , q_ti, CpuRead); + // autoView( vq_tf , q_tf, CpuRead); + + // if (group == 1) { + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + // auto Dq_ti = vq_ti[ss]; + // auto Dq_tf = vq_tf[ss]; + // sobj result=Zero(); + // BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + // vcorr[ss] += result; + // });//end loop over lattice sites + // } else if (group == 2) { + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + // auto 
Dq_ti = vq_ti[ss]; + // auto Dq_tf = vq_tf[ss]; + // sobj result=Zero(); + // BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + // vcorr[ss] += result; + // });//end loop over lattice sites + // } else if (group == 3) { + // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + // auto Dq_ti = vq_ti[ss]; + // auto Dq_tf = vq_tf[ss]; + // sobj result=Zero(); + // BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + + // vcorr[ss] += result; + // });//end loop over lattice sites + // } + + autoView( vcorr , stn_corr , AcceleratorWrite); + autoView( vq_ti , q_ti , AcceleratorRead); + autoView( vq_tf , q_tf , AcceleratorRead); if (group == 1) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti[ss]; - auto Dq_tf = vq_tf[ss]; - sobj result=Zero(); - BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - vcorr[ss] += result; + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + //sobj result=Zero(); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + //BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); //WRONG + // vcorr[ss] += result; + coalescedWrite(vcorr[ss],result); });//end loop over lattice sites + } else if (group == 2) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti[ss]; - auto Dq_tf = vq_tf[ss]; - sobj result=Zero(); - BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - vcorr[ss] += result; + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + //sobj result=Zero(); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + // 
BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); //WRONG + // vcorr[ss] += result; + coalescedWrite(vcorr[ss],result); });//end loop over lattice sites } else if (group == 3) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti[ss]; - auto Dq_tf = vq_tf[ss]; - sobj result=Zero(); - BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - - vcorr[ss] += result; + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + //sobj result=Zero(); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + //BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup3Site(Dq_spec1,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); //WRONG + // vcorr[ss] += result; + coalescedWrite(vcorr[ss],result); });//end loop over lattice sites } + } /*********************************************************************** * End of BaryonGamma3pt-function code. 
* - * * + * * * The following code is for Sigma -> N rare hypeon decays * **********************************************************************/ @@ -997,49 +1039,60 @@ void BaryonUtils::BaryonGamma3pt( * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template +template accelerator_inline void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) { Gamma g5(Gamma::Algebra::Gamma5); - auto DuG = Du_spec * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; - // Dq_loop * \gamma_\mu^L - auto DqG = Dq_loop * Gamma_H; + //auto Gn_adjDd_GH_Ds = GammaB_nucl * g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; + auto adjDd_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; + auto Gn_adjDd_GH_Ds = GammaB_nucl * adjDd_GH_Ds; + auto Du_Gs = Du_spec * GammaB_sigma; + auto Dq_GH = Dq_loop * Gamma_H; + auto Tr_Dq_GH = trace(Dq_GH)()()(); + + Real ee; for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template +template accelerator_inline void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - 
const Gamma GammaB_nucl, - robj &result) + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) { Gamma g5(Gamma::Algebra::Gamma5); - auto DuG = Du_spec * GammaB_nucl; - auto adjDu = g5 * adj(Du_tf) * g5; - auto adjDuG = adjDu * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; - // Dq_loop * \gamma_\mu^L - auto DuGH = Du_ti * Gamma_H; + auto Du_Gs = Du_spec * GammaB_sigma; + //auto Gn_adjDd_GH_Ds = GammaB_nucl * g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; + auto adjDd_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; + auto Gn_adjDd_GH_Ds = GammaB_nucl * adjDd_GH_Ds; + auto adjDu_GH_Du = g5 * adj(Du_tf) * g5 * Gamma_H * Du_ti; + auto adjDu_GH_Du_Gs = adjDu_GH_Du * GammaB_sigma; + + Real ee; for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template +template accelerator_inline void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) { Gamma g5(Gamma::Algebra::Gamma5); - auto DuG = Du_spec * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L - auto GDsG = GammaB_sigma * Ds_ti * Gamma_H; - // Dq_loop * \gamma_\mu^L * (\gamma_5 * 
Dd^\dagger * \gamma_5) - auto DqGDd = Dq_loop * Gamma_H * g5 * adj(Dd_tf) * g5; + //auto Gn_adjDd_GH_Duloop_GH_Ds = GammaB_nucl * g5 * adj(Dd_tf) * g5 * Gamma_H * Dq_loop * Gamma_H * Ds_ti; + auto adjDd_GH_Duloop_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Dq_loop * Gamma_H * Ds_ti; + auto Gn_adjDd_GH_Duloop_GH_Ds = GammaB_nucl * adjDd_GH_Duloop_GH_Ds; + auto Du_Gs = Du_spec * GammaB_sigma; + + Real ee; for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template +template accelerator_inline void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) { Gamma g5(Gamma::Algebra::Gamma5); - auto DuG = Du_spec * GammaB_nucl; - auto adjDu = g5 * adj(Du_tf) * g5; - auto adjDuG = adjDu * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L - auto GDsG = GammaB_sigma * Ds_ti * Gamma_H; - // Du * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto DuGDd = Du_ti * Gamma_H * g5 * adj(Dd_tf) * g5; + auto Du_Gs = Du_spec * GammaB_sigma; + auto adjDu_GH_Ds = g5 * adj(Du_tf) * g5 * Gamma_H * Ds_ti; + //auto Gn_adjDd_GH_Du = GammaB_nucl * g5 * adj(Dd_tf) * g5 * Gamma_H * Du_ti; + auto adjDd_GH_Du = g5 * adj(Dd_tf) * g5 * Gamma_H * Du_ti; + auto Gn_adjDd_GH_Du = GammaB_nucl * adjDd_GH_Du; // for some reason I needed to split this into 
two lines to avoid the compilation error 'error: identifier "Grid::Gamma::mul" is undefined in device code' + + auto Gn_adjDd_GH_Du_Gs = Gn_adjDd_GH_Du * GammaB_sigma; + + Real ee; for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s template void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, - const mobj &Du_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &stn_corr) + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); @@ -1229,39 +1316,43 @@ void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, GridBase *grid = qs_ti.Grid(); - autoView( vcorr, stn_corr, CpuWrite); - autoView( vq_loop , qq_loop, CpuRead); - autoView( vd_tf , qd_tf, CpuRead); - autoView( vs_ti , qs_ti, CpuRead); + autoView( vcorr , stn_corr , AcceleratorWrite); + autoView( vq_loop , qq_loop , AcceleratorRead); + autoView( vd_tf , qd_tf , AcceleratorRead); + autoView( vs_ti , qs_ti , AcceleratorRead); + + bool doQ1 = (op == "Q1"); + bool doQ2 = (op == "Q2"); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_loop = vq_loop[ss]; - auto Dd_tf = vd_tf[ss]; - auto Ds_ti = vs_ti[ss]; - sobj result=Zero(); - if(op == "Q1"){ + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + if(doQ1){ 
SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else if(op == "Q2"){ + } else if(doQ2){ SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } - vcorr[ss] = result; - } );//end loop over lattice sites + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites } template template void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, - const PropagatorField &qq_tf, - const mobj &Du_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &stn_corr) + const PropagatorField &qq_tf, + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); @@ -1269,27 +1360,31 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, GridBase *grid = qs_ti.Grid(); - autoView( vcorr , stn_corr, CpuWrite); - autoView( vq_ti , qq_ti, CpuRead); - autoView( vq_tf , qq_tf, CpuRead); - autoView( vd_tf , qd_tf, CpuRead); - autoView( vs_ti , qs_ti, CpuRead); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ - auto Dq_ti = vq_ti[ss]; - auto Dq_tf = vq_tf[ss]; - auto Dd_tf = vd_tf[ss]; - auto Ds_ti = vs_ti[ss]; - sobj result=Zero(); - if(op == "Q1"){ + autoView( vcorr , stn_corr , AcceleratorWrite ); + autoView( vq_ti , qq_ti , AcceleratorRead ); + autoView( vq_tf , qq_tf , AcceleratorRead ); + autoView( vd_tf , qd_tf , AcceleratorRead ); + autoView( vs_ti , qs_ti , AcceleratorRead ); + + bool doQ1 = (op == "Q1"); + bool doQ2 = (op == "Q2"); + + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = 
vq_ti(ss); + auto Dq_tf = vq_tf(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + if(doQ1){ SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else if(op == "Q2"){ + } else if(doQ2){ SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } - vcorr[ss] = result; - } );//end loop over lattice sites + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites } NAMESPACE_END(Grid); From 4dd9e39e0d465e7cad3aef001dc0edf5e65b0ea6 Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 00:54:31 +0100 Subject: [PATCH 092/201] up to +36% performance gain for dslash/dwf on QPACE 4 using GCC 10.1.1 --- .../implementation/WilsonKernelsAsmA64FX.h | 268 +- .../WilsonKernelsAsmBodyA64FX.h | 105 +- Grid/simd/Fujitsu_A64FX_asm_double.h | 148 +- Grid/simd/Fujitsu_A64FX_asm_single.h | 148 +- Grid/simd/Fujitsu_A64FX_intrin_double.h | 160 +- Grid/simd/Fujitsu_A64FX_intrin_single.h | 160 +- Grid/simd/Fujitsu_A64FX_undef.h | 1 + Grid/simd/gridverter.py | 2377 ----------------- 8 files changed, 447 insertions(+), 2920 deletions(-) delete mode 100755 Grid/simd/gridverter.py diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 2e587dfa..ffec05a0 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -38,9 +38,6 @@ Author: Nils Meyer Regensburg University // undefine everything related to kernels #include -// enable A64FX body -#define WILSONKERNELSASMBODYA64FX -//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine @@ 
-63,119 +60,89 @@ Author: Nils Meyer Regensburg University #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include 
-#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// @@ -185,119 +152,89 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define 
INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, 
SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefine @@ -330,119 +267,89 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined 
(WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, double @@ -451,124 +358,93 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) 
#include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefs -#undef WILSONKERNELSASMBODYA64FX #include #endif //A64FXASM diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h 
b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 406e5c25..83588a7d 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -25,6 +25,11 @@ Author: Nils Meyer Regensburg University See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + +// GCC 10 messes up SVE instruction scheduling using -O3 only, +// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance is better than armclang 20.2 + #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ #define DIR1_PROJ YP_PROJ @@ -97,7 +102,7 @@ Author: Nils Meyer Regensburg University PROJ; \ MAYBEPERM(PERMUTE_DIR,perm); \ } else { \ - LOAD_CHI(base); \ + LOAD_CHI(base); \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ MULT_2SPIN_1(Dir); \ @@ -110,6 +115,15 @@ Author: Nils Meyer Regensburg University } \ RECON; \ +/* +NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty + though I expected that it would improve on performance + + if (s == 0) { \ + if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ + } \ +*/ + #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ @@ -126,73 +140,63 @@ Author: Nils Meyer Regensburg University #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - LOAD_CHIMU(base); \ - LOAD_TABLE(PERMUTE_DIR); \ - PROJ; \ - MAYBEPERM(PERMUTE_DIR,perm); \ - }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ - MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 
0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - PREFETCH_CHIMU_L2(basep); \ - } else { PREFETCH_CHIMU(base); } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + MULT_2SPIN_2; \ + RECON; \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + PREFETCH_CHIMU_L2(basep); \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif + //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR - #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0; \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); 
*/ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif + + { int nmu; int local,perm, ptype; @@ -209,7 +213,6 @@ Author: Nils Meyer Regensburg University int ssn=ssU+1; if(ssn>=nmax) ssn=0; // int sUn=lo.Reorder(ssn); int sUn=ssn; - LOCK_GAUGE(0); #else int sU =ssU; int ssn=ssU+1; if(ssn>=nmax) ssn=0; @@ -295,6 +298,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + // { uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); #ifdef SHOW @@ -308,6 +316,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); #ifdef SHOW @@ -321,6 +334,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); + //} + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); #ifdef SHOW @@ -341,6 +359,7 @@ Author: Nils Meyer Regensburg University base = (uint64_t) &out[ss]; basep= st.GetPFInfo(nent,plocal); ent++; basep = (uint64_t) &out[ssn]; + //PREFETCH_RESULT_L1_STORE(base); RESULT(base,basep); #ifdef SHOW diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h index 76c556d7..bbc4efe7 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ b/Grid/simd/Fujitsu_A64FX_asm_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS 
DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,11 +71,18 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ {1, 0, 3, 2, 5, 4, 7, 6}, \ {0, 1, 2, 4, 5, 6, 7, 8} };\ +asm ( \ + "ptrue p5.d \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ asm ( \ "fmov z31.d , 0 \n\t" \ : \ @@ -130,7 +138,7 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ @@ -149,7 +157,7 @@ asm ( \ // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ @@ -163,12 +171,12 @@ asm ( \ #define LOAD_CHI_A64FXd(base) \ { \ asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, 
mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -178,19 +186,18 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, 
[%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -201,19 +208,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -224,19 +230,18 @@ asm ( \ { \ const SiteSpinor & 
ref(in[offset]); \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -293,17 +298,16 @@ asm ( \ ); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.d \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, 
mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -312,14 +316,14 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.d, p5/m, z31.d \n\t" \ "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ "movprfx z21.d, p5/m, z31.d \n\t" \ @@ -338,9 +342,9 @@ asm ( \ "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1d { z25.d }, p5/z, 
[%[fetchptr], -1, mul vl] \n\t" \ + "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -560,7 +564,6 @@ asm ( \ #define TM_PROJ_A64FXd \ { \ asm ( \ - "ptrue p5.d \n\t" \ "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ @@ -715,7 +718,6 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXd \ asm ( \ - "ptrue p5.d \n\t" \ "fmov z0.d , 0 \n\t" \ "fmov z1.d , 0 \n\t" \ "fmov z2.d , 0 \n\t" \ @@ -733,13 +735,13 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h index d809f83b..e629f617 100644 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ b/Grid/simd/Fujitsu_A64FX_asm_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); 
PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,11 +71,18 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ +asm ( \ + "ptrue p5.s \n\t" \ + : \ + : \ + : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ +); \ asm ( \ "fmov z31.s , 0 \n\t" \ : \ @@ -130,7 +138,7 @@ asm ( \ // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ asm ( \ "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ @@ -149,7 +157,7 @@ asm ( \ // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ @@ -163,12 +171,12 @@ asm ( \ #define LOAD_CHI_A64FXf(base) \ { \ asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] 
\n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -178,19 +186,18 @@ asm ( \ #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul 
vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (base + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -201,19 +208,18 @@ asm ( \ { \ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -224,19 +230,18 @@ asm ( \ { 
\ const SiteSpinor & ref(in[offset]); \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ + "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ + "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ + "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (&ref[2][0]) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -293,17 +298,16 @@ asm ( \ ); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ptrue p5.s \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr 
z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -312,14 +316,14 @@ asm ( \ // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ + "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ + "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ + "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ + "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ "movprfx z18.s, p5/m, z31.s \n\t" \ "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ "movprfx z21.s, p5/m, z31.s \n\t" \ @@ -338,9 +342,9 @@ asm ( \ "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ + "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ + 
"ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ + "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ : \ : [fetchptr] "r" (baseU + 2 * 3 * 64) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ @@ -560,7 +564,6 @@ asm ( \ #define TM_PROJ_A64FXf \ { \ asm ( \ - "ptrue p5.s \n\t" \ "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ @@ -715,7 +718,6 @@ asm ( \ // ZERO_PSI #define ZERO_PSI_A64FXf \ asm ( \ - "ptrue p5.s \n\t" \ "fmov z0.s , 0 \n\t" \ "fmov z1.s , 0 \n\t" \ "fmov z2.s , 0 \n\t" \ @@ -733,13 +735,13 @@ asm ( \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ ); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ + "dc zva, %[fetchptr]\n\t" \ : \ : [fetchptr] "r" (base) \ : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 232610f2..361246fc 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define 
SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ @@ -126,18 +128,18 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXd(base) \ { \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), 
result_11); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ @@ -156,7 +158,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ @@ -170,7 +172,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ @@ -178,62 +180,62 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ { \ - Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0)); \ + Chi_01 = 
svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), 
(int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = 
svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 
+263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = 
svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXd \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f64(0.); \ result_32 = svdup_f64(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 180e5f4f..30273b6e 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ 
b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ @@ -126,18 +128,18 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXf(base) \ { \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + 
svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ @@ -156,7 +158,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ @@ -170,7 +172,7 @@ Author: Nils Meyer // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ @@ -178,62 +180,62 @@ Author: Nils Meyer // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ { \ - Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, 
(float32_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = 
svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 
2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + 
Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, 
(float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXf \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f32(0.); \ result_32 = svdup_f32(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define 
PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 81eec37a..51762a60 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -46,6 +46,7 @@ Author: Nils Meyer #undef MULT_2SPIN_2 #undef MAYBEPERM #undef LOAD_CHI +#undef ZERO_PSI #undef XP_PROJ #undef YP_PROJ #undef ZP_PROJ diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py deleted file mode 100755 index f00a5019..00000000 --- a/Grid/simd/gridverter.py +++ /dev/null @@ -1,2377 +0,0 @@ -#!/usr/bin/python3 - -import re -import argparse -import sys - -# Grid for A64FX -# -# * should align std::vector to (multiples of) cache block size = 256 bytes - -# place benchmark runtime in cycles here ! -measured_cycles = 690 #1500 #775 #1500 - - -# command line parser -parser = argparse.ArgumentParser(description="Dslash generator.") -parser.add_argument("--single", action="store_true", default="False") -parser.add_argument("--double", action="store_true", default="True") -parser.add_argument("--debug", action="store_true", default="False") -parser.add_argument("--gridbench", action="store_true", default="False") -args = parser.parse_args() - -print(args) - -ASM_LOAD_CHIMU = True # load chimu -ASM_LOAD_GAUGE = True # load gauge -ASM_LOAD_TABLE = True # load table -ASM_STORE = True # store result - -# Disable all loads and stores in asm for benchmarking purposes -#DISABLE_ASM_LOAD_STORE = True -DISABLE_ASM_LOAD_STORE = False - -if DISABLE_ASM_LOAD_STORE: - ASM_LOAD_CHIMU = True # load chimu - ASM_LOAD_GAUGE = True # load gauge - ASM_LOAD_TABLE = True # load table - ASM_STORE = False # store result - -# Alternative implementation using PROJ specific loads works, -# but be careful with predication - -ALTERNATIVE_LOADS = False -#ALTERNATIVE_LOADS = not ALTERNATIVE_LOADS # True - -# Alternative register mapping, -# must use with my_wilson4.h and my_wilson4pf.h - -ALTERNATIVE_REGISTER_MAPPING = False 
-#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING - -if ALTERNATIVE_REGISTER_MAPPING == True: - ALTERNATIVE_LOADS = False - -# use movprfx -MOVPRFX = False -MOVPRFX = not MOVPRFX - - -PREFETCH = False -PREFETCH = not PREFETCH # True - -PRECISION = 'double' # DP by default -PRECSUFFIX = 'A64FXd' -if args.single == True: - PRECISION = 'single' - PRECSUFFIX = 'A64FXf' - -_DEBUG = False #True # insert debugging output -if args.debug == True: - _DEBUG = True - -GRIDBENCH = False -if args.gridbench == True: - GRIDBENCH = True - -print("PRECISION = ", PRECISION) -print("DEBUG = ", _DEBUG) -print("ALTERNATIVE_LOADS = ", ALTERNATIVE_LOADS) -print("ALTERNATIVE_REGISTER_MAPPING = ", ALTERNATIVE_REGISTER_MAPPING) -print("MOVPRFX = ", MOVPRFX) -print("DISABLE_ASM_LOAD_STORE = ", DISABLE_ASM_LOAD_STORE) -print("GRIDBENCH = ", GRIDBENCH) - -print("") - -#sys.exit(0) - - -#_DEBUG = True # insert debugging output - -FETCH_BASE_PTR_COLOR_OFFSET = 2 # offset for scalar plus signed immediate addressing -STORE_BASE_PTR_COLOR_OFFSET = 2 - -# 64-bit gp register usage !!! armclang 20.0 complains about the register choice !!! -# table address: x30 -# data address: x29 -# store address: x28 -# debug address: r8 - -# Max performance of complex FMA using FCMLA instruction -# is 25% peak. -# -# Issue latency of FCMLA is 2 cycles. -# Need 2 FCMLA instructions for complex FMA. -# Complete complex FMA takes 4 cycles. -# Peak throughput is 4 * 8 Flops DP = 32 Flops DP in 4 cycles. -# A64FX FMA throughput is 4 * 8 * 2 * 2 = 132 Flops DP in 4 cycles. -# -> 25% peak FMA -# -# In: 3x 512 bits = 192 bytes -# Out: 1x 512 bits = 64 bytes -# Tot: 4x 512 bits = 256 bytes -# -# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) - -OPT = """ -* interleave prefetching and compute in MULT_2SPIN -* could test storing U's in MULT_2SPIN to L1d for cache line update -* structure reordering: MAYBEPERM after MULT_2SPIN ? 
-""" - -filename = 'XXX' -LEGAL = """/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: {} - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -""" - -class Register: - - def __init__(self, variable, asmreg='X', predication=False): - global d - x = 'Y' - if predication == False: - x = asmreg # + d['asmsuffix'] - else: - x = asmreg - self.asmreg = x - self.asmregwithsuffix = asmreg + d['asmsuffix'] - self.asmregbyte = asmreg + '.b' - self.name = variable - self.asmname = variable - self.asmnamebyte = variable + '.b' - self.predication = predication - - d['registers'] += 1 - - def define(self, statement): - global d - d['C'] += F'#define {self.name} {statement}' - #d['A'] += F'#define {self.name} {statement}' - - def declare(self, predication=False): - global d - - if self.predication == False: - d['C'] += F' Simd {self.name}; \\\n' - - predtype = 'svfloat64_t' - if PRECISION == 'single': - predtype = 'svfloat32_t' - - d['I'] += F' {predtype} {self.name}; \\\n' - else: - d['I'] += F' 
svbool_t {self.name}; \\\n' - #d['A'] += F'#define {self.name} {self.asmreg} \n' - - def loadpredication(self, target='A'): - global d - if (target == 'A'): - d['A'] += F' "ptrue {self.asmregwithsuffix} \\n\\t" \\\n' - d['asmclobber'].append(F'"{self.asmreg}"') - - def loadtable(self, t): - global d - d['load'] += d['factor'] - gpr = d['asmtableptr'] - - cast = 'uint64_t' - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - # asm_opcode = 'ld1w' - # cast = 'uint32_t' - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'uint32_t' - - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' - - # using immediate index break-out works - if asm_opcode == 'ldr': - # ldr version - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - else: - # ld1 version - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - - d['asminput'].append(F'[tableptr] "r" (&lut[0])') - d['asminput'].append(F'[index] "i" ({t})') - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - - def load(self, address, target='ALL', cast='float64_t', colors=3, offset=FETCH_BASE_PTR_COLOR_OFFSET): - global d - d['load'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - #asm_opcode = 'ld1w' - # cast = 'float32_t' - - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'float32_t' - - gpr = d['asmfetchbaseptr'] - intrinfetchbase = d['intrinfetchbase'] - if (target in ['ALL', 'C']): - d['C'] += F' {self.name} = {address}; \\\n' - if (target in ['ALL', 'I']): -# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - if (target in ['ALL', 'A']): - if asm_opcode == 'ldr': - d['A'] 
+= F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - - def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): - global d - d['store'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'stnt1d' - #if PRECISION == 'single': - # asm_opcode = 'stnt1w' - # cast = 'float32_t' - asm_opcode = 'str' - if PRECISION == 'single': - asm_opcode = 'str' - cast = 'float32_t' - - intrinstorebase = d['intrinstorebase'] - - d['C'] += F' {address} = {self.name}; \\\n' - #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - if asm_opcode == 'str': - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - - def movestr(self, str): - global d - #d['move'] += d['factor'] - d['I'] += F' {self.name} = {str}; \\\n' - - def move(self, op1): - global d - d['move'] += d['factor'] - d['C'] += F' {self.name} = {op1.name}; \\\n' - d['I'] += F' {self.name} = {op1.name}; \\\n' - d['A'] += F' "mov {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - - # a = a + b , a = b + c - def add(self, op1, op2=None): - global d - d['add'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} + {op1.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + {op2.name}; \\\n' - d['I'] += F' {self.name} = 
svadd_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a -b , a = b - c - def sub(self, op1, op2=None): - global d - d['sub'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} - {op1.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - {op2.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a * b , a = b * c - def mul(self, op1, op2): - global d - d['mul'] += 2 * d['factor'] - d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = __svzero({self.name}); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "mov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mul0(self, op1, op2, op3=None, constructive=False): - global d - d['mul'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {op1.name}, {op2.name}, {op3.name}, 0); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op2.asmregwithsuffix}, 
{op3.asmregwithsuffix}, 0 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mul1(self, op1, op2): - global d - d['mul'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac(self, op1, op2): - global d - d['mac'] += 2 * d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac0(self, op1, op2): - global d - d['mac'] += d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mac1(self, op1, op2): - global d - d['mac'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def zero(self, zeroreg=False): - d['zero'] += d['factor'] - d['C'] += F' {self.name} = 0; \\\n' - #d['I'] += F' {self.name} = __svzero({self.name}); \\\n' only 
armclang - - if PRECISION == 'double': - d['I'] += F' {self.name} = svdup_f64(0.); \\\n' - else: - d['I'] += F' {self.name} = svdup_f32(0.); \\\n' - - if zeroreg == True: - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - else: - #using mov z, zero0 issue 1c, FLA, latency 6c - #d['A'] += F' "mov {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using mov z, 0 issue 1c, FLA, latency 6c - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - - #using xor z, z, z issue 0.5c, FL*, latency 4c - #d['A'] += F' "eor {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - #using and z, z, zero0 issue 0.5c, FL*, latency 4c - #d['A'] += F' "and {self.asmregwithsuffix}, {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using sub z, z, z issue 0.5c, FL*, latency 9c - #d['A'] += F' "sub {self.asmregwithsuffix}, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - # without table - def timesI(self, op1, tempreg=None, tablereg=None): - global d - d['timesI'] += d['factor'] - d['C'] += F' {self.name} = timesI({op1.name}); \\\n' - # correct if DEBUG enabled, wrong if DEBUG disabled; no idea what's causing this - #table.load('table2', target='I', cast='uint64_t') - #d['I'] += F' {self.name} = svtbl({op1.name}, {tablereg.name}); \\\n' - #d['I'] += F' {self.name} = svneg_x(pg2, {self.name}); \\\n' - # timesI using trn tested, works but tbl should be faster - d['I'] += F' {tempreg.name} = svtrn2({op1.name}, {op1.name}); \\\n' - d['I'] += F' {tempreg.name} = svneg_x(pg1, {tempreg.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({tempreg.name}, {op1.name}); \\\n' - d['A'] += F' "trn2 {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fneg {tempreg.asmregwithsuffix}, {pg1.asmreg}/m, {tempreg.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "trn1 {self.asmregwithsuffix}, {tempreg.asmregwithsuffix}, 
{op1.asmregwithsuffix} \\n\\t" \\\n' - - def addTimesI(self, op1, op2=None, constructive=False): - global d - d['addTimesI'] += d['factor'] - - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def subTimesI(self, op1, op2=None, constructive=False): - global d - d['subTimesI'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} - timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 270); \\\n' - d['A'] += F' "fcadd 
{self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - - # timesMinusI is not used, def is probably wrong !!!! OPTIMIZATION with table - def timesMinusI(self, op1): - global d - d['timesMinusI'] += d['factor'] - d['C'] += F' {self.name} = timesMinusI({self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {op1.name}); \\\n' - d['I'] += F' {self.name} = svneg_x(pg1, {self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {self.name}); \\\n' - - def permute(self, dir, tablereg=None): - global d - d['permutes'] += d['factor'] - - d['C'] += F' permute{dir}({self.name}, {self.name}); \\\n' - - d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - # if dir == 0: - # d['I'] += F' {self.name} = svext({self.name}, {self.name}, 4); \\\n' - # # this might not work, see intrinsics assembly - # # d['A'] += F' ext {self.name}, {self.name}, {self.name}, #4 \\\n' - # # use registers directly - # d['A'] += F' "ext {self.asmregbyte}, {self.asmregbyte}, {self.asmregbyte}, 32 \\n\\t" \\\n' - # - # elif dir in [1, 2]: - # d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - # d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - def debug(self): - global d - typecast = d['cfloat'] - gpr = d['asmdebugptr'] - vregs = d['asmclobberlist'] - if (d['debug'] == True): - d['C'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['I'] += F'svst1(pg1, ({typecast}*)&debugreg.v, {self.name}); \\\n' 
- d['I'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - #d['I'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['A'] += F'asm ( \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' "str {self.asmreg}, [%[ptr]] \\n\\t" \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' : "=m" (debugreg.v) \\\n' - d['A'] += F' : [ptr] "r" (&debugreg.v) \\\n' - d['A'] += F' : "p5", "cc", "memory" \\\n' - d['A'] += F'); \\\n' - d['A'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - # this form of addressing is not valid! - #d['A'] += F' "str {self.asmreg}, %[ptr] \\n\\t" \\\n' -# end Register - -def define(s, target='ALL'): - x = F'#define {s} \n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def definemultiline(s): - x = F'#define {s} \\\n' - global d - d['C'] += x - d['I'] += x - d['A'] += x - -def write(s, target='ALL'): - x = F'{s}\n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def curlyopen(): - write(F'{{ \\') - -def curlyclose(): - write(F'}}') - -def newline(target='ALL'): - global d - - if target == 'A': - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - else: - if d['C'][-2:] == '\\\n': - d['C'] = d['C'][:-2] + '\n\n' - if d['I'][-2:] == '\\\n': - d['I'] = d['I'][:-2] + '\n\n' - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - -# load the base pointer for fetches -def fetch_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - - # DEBUG - #colors=3 - #indices = re.findall(r'\d+', address) - #index = (int(indices[0]) - FETCH_BASE_PTR_COLOR_OFFSET) * colors + int(indices[1]) - #print(F'{address} (base)') - - vregs = d['asmclobberlist'] - if 
target == 'A': - d['asminput'].append(F'[fetchptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - #print("intrinfetchbase = ", address) - d['intrinfetchbase'] = address - -# load the base pointer for stores -def store_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - gpr = d['asmstorebaseptr'] - vregs = d['asmclobberlist'] - if target == 'A': - d['asminput'].append(F'[storeptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - d['intrinstorebase'] = address - -def prefetch_L1(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL1STRM" # weak - #policy = "PLDL1KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L2(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL2STRM" # weak - #policy = "PLDL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - #d['A'] += - -def prefetch_L2_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL2STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L1_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL1STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' 
svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - - -def asmopen(): - #write('asm volatile ( \\', target='A') - write('asm ( \\', target='A') - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - #write('asm volatile ( \\', target='A') - -def asmclose(): - global d - - #print(d['asminput']) - - asmin = d['asminput'] - asmin_s = '' - if len(asmin) > 0: - asmin = list(dict.fromkeys(asmin)) # remove duplicates - #print(asmin) - for el in asmin: - asmin_s += el + ',' - asmin_s = asmin_s[:-1] - #print("-> ", asmin_s) - - d['asminput'] = [] - - asmout = d['asmoutput'] - asmout_s = '' - if len(asmout) > 0: - asmout = list(dict.fromkeys(asmout)) # remove duplicates - for el in asmout: - asmout_s += el + ',' - asmout_s = asmout_s[:-1] - - d['asmoutput'] = [] - - # DEBUG put all regs into clobber by default - d['asmclobber'].extend(d['asmclobberlist']) - - asmclobber = d['asmclobber'] - asmclobber_s = '' - #print(asmclobber) - if len(asmclobber) > 0: - asmclobber = list(dict.fromkeys(asmclobber)) # remove duplicates - for el in asmclobber: - asmclobber_s += el + ',' - asmclobber_s = asmclobber_s[:-1] - - d['asmclobber'] = [] - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - - - write(F' : {asmout_s} \\', target='A') - write(F' : {asmin_s} \\', target='A') - write(F' : {asmclobber_s} \\', target='A') - write('); \\', target='A') - -# -------------------------------------------------------------------------------- - -# string of vector registers to be used in clobber list -#clobberlist = ['"p0"'] -clobberlist = ['"p5"'] -clobberlist.append('"cc"') -for i in range(0, 32): - clobberlist.append(F'"z{i}"') - -d = { -'debug': _DEBUG, -'C': '', -'I': '', -'A': '', -'asmsuffix': '.d', # double precision by default -'cfloat': 
'float64_t', -'registers': 0, -'load': 0, -'store': 0, -'move': 0, -'movprfx': 0, -'zero': 0, -'add': 0, -'sub': 0, -'mul': 0, -'mac': 0, -'permutes': 0, -'neg': 0, -'addTimesI': 0, -'subTimesI': 0, -'timesI': 0, -'timesMinusI': 0, -'flops': 0, -'factor': 1, # multiplicity -'asmtableptr': 'x30', -'asmfetchbaseptr': 'x29', -'asmstorebaseptr': 'x28', -'asmdebugptr': 'r12', -'asminput': [], -'asmoutput': [], -'asmclobber': [], -'asmclobberlist': clobberlist, -'intrinfetchbase': '', -'intrinstorebase': '', -'cycles_LOAD_CHIMU': 0, -'cycles_PROJ': 0, -'cycles_PERM': 0, -'cycles_MULT_2SPIN': 0, -'cycles_RECON': 0, -'cycles_RESULT': 0, -'cycles_ZERO_PSI': 0, -'cycles_PREFETCH_L1': 0, -'cycles_PREFETCH_L2': 0 -} - -if PRECISION == 'single': - d['asmsuffix'] = '.s' - d['cfloat'] = 'float32_t' - -# -------------------------------------------------------------------------------- -# Grid -# -------------------------------------------------------------------------------- - -# Variables / Registers -result_00 = Register('result_00', asmreg='z0') -result_01 = Register('result_01', asmreg='z1') -result_02 = Register('result_02', asmreg='z2') -result_10 = Register('result_10', asmreg='z3') -result_11 = Register('result_11', asmreg='z4') -result_12 = Register('result_12', asmreg='z5') -result_20 = Register('result_20', asmreg='z6') -result_21 = Register('result_21', asmreg='z7') -result_22 = Register('result_22', asmreg='z8') -result_30 = Register('result_30', asmreg='z9') -result_31 = Register('result_31', asmreg='z10') -result_32 = Register('result_32', asmreg='z11') # 12 Regs -Chi_00 = Register('Chi_00', asmreg='z12') -Chi_01 = Register('Chi_01', asmreg='z13') -Chi_02 = Register('Chi_02', asmreg='z14') -Chi_10 = Register('Chi_10', asmreg='z15') -Chi_11 = Register('Chi_11', asmreg='z16') -Chi_12 = Register('Chi_12', asmreg='z17') # 6 -UChi_00 = Register('UChi_00', asmreg='z18') -UChi_01 = Register('UChi_01', asmreg='z19') -UChi_02 = Register('UChi_02', asmreg='z20') -UChi_10 = 
Register('UChi_10', asmreg='z21') -UChi_11 = Register('UChi_11', asmreg='z22') -UChi_12 = Register('UChi_12', asmreg='z23') # 6 -U_00 = Register('U_00', asmreg='z24') -U_10 = Register('U_10', asmreg='z25') -U_20 = Register('U_20', asmreg='z26') -U_01 = Register('U_01', asmreg='z27') -U_11 = Register('U_11', asmreg='z28') -U_21 = Register('U_21', asmreg='z29') # 6 -> 30 Registers - -table0 = Register('table0', asmreg='z30') -zero0 = Register('zero0', asmreg='z31') # 2 -> 32 Registers -# can't overload temp1 / table due to type mismatch using intrinsics :( -# typecasting SVE intrinsics variables is not allowed - -pg1 = Register('pg1', predication=True, asmreg='p5') -#pg2 = Register('pg2', predication=True, asmreg='p1') - -# Overloaded with Chi_* and UChi_* -Chimu_00 = Register('Chimu_00', asmreg=Chi_00.asmreg) -Chimu_01 = Register('Chimu_01', asmreg=Chi_01.asmreg) -Chimu_02 = Register('Chimu_02', asmreg=Chi_02.asmreg) -Chimu_10 = Register('Chimu_10', asmreg=Chi_10.asmreg) -Chimu_11 = Register('Chimu_11', asmreg=Chi_11.asmreg) -Chimu_12 = Register('Chimu_12', asmreg=Chi_12.asmreg) -if ALTERNATIVE_REGISTER_MAPPING == False: - Chimu_20 = Register('Chimu_20', asmreg=UChi_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=UChi_01.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=UChi_02.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=UChi_10.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=UChi_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=UChi_12.asmreg) # 12 Registers -else: # wilson4.h - Chimu_20 = Register('Chimu_20', asmreg=U_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=U_10.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=U_20.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=U_01.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=U_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=U_21.asmreg) - -# debugging output -def debugall(msg=None, group='ALL'): - global d - if (d['debug'] == False): - return - write(F'std::cout << std::endl << "DEBUG -- {msg}" 
<< std::endl; \\') - if (group in ['ALL', 'result']): - result_00.debug() - result_01.debug() - result_02.debug() - result_10.debug() - result_11.debug() - result_12.debug() - result_20.debug() - result_21.debug() - result_22.debug() - result_30.debug() - result_31.debug() - result_32.debug() - if (group in ['ALL', 'Chi']): - Chi_00.debug() - Chi_01.debug() - Chi_02.debug() - Chi_10.debug() - Chi_11.debug() - Chi_12.debug() - if (group in ['ALL', 'UChi']): - UChi_00.debug() - UChi_01.debug() - UChi_02.debug() - UChi_10.debug() - UChi_11.debug() - UChi_12.debug() - if (group in ['ALL', 'U']): - U_00.debug() - U_10.debug() - U_20.debug() - U_01.debug() - U_11.debug() - U_21.debug() - if (group in ['ALL', 'Chimu']): - Chimu_00.debug() - Chimu_01.debug() - Chimu_02.debug() - Chimu_10.debug() - Chimu_11.debug() - Chimu_12.debug() - Chimu_20.debug() - Chimu_21.debug() - Chimu_22.debug() - Chimu_30.debug() - Chimu_31.debug() - Chimu_32.debug() - -# -------------------------------------------------------------------------------- -# Output -# -------------------------------------------------------------------------------- - -if ALTERNATIVE_LOADS == True: - define(F'LOAD_CHIMU_0213_PLUG LOAD_CHIMU_0213_{PRECSUFFIX}') - define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') - define(F'LOAD_CHIMU(x)') -else: - #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') - define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') - -if PREFETCH: - define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_RESULT_L1_STORE(A) 
PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH_CHIMU(A)') -else: - define(F'PREFETCH_CHIMU_L1(A)') - define(F'PREFETCH_GAUGE_L1(A)') - define(F'PREFETCH_CHIMU_L2(A)') - define(F'PREFETCH_GAUGE_L2(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A)') - define(F'PREFETCH_RESULT_L2_STORE(A)') - -# standard defines -define(F'LOCK_GAUGE(A)') -define(F'UNLOCK_GAUGE(A)') -define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') -define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') -define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') -define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') -# don't need zero psi, everything is done in recons -#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') -define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') -# loads projections -define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') -define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') -define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') -define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') -define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') -define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') -define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') -define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') -# recons -define(F'XP_RECON XP_RECON_{PRECSUFFIX}') -define(F'XM_RECON XM_RECON_{PRECSUFFIX}') -define(F'XM_RECON_ACCUM XM_RECON_ACCUM_{PRECSUFFIX}') -define(F'YM_RECON_ACCUM YM_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZM_RECON_ACCUM ZM_RECON_ACCUM_{PRECSUFFIX}') -define(F'TM_RECON_ACCUM TM_RECON_ACCUM_{PRECSUFFIX}') -define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') -define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') -define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') -# new permutes -define(F'PERMUTE_DIR0 
0') -define(F'PERMUTE_DIR1 1') -define(F'PERMUTE_DIR2 2') -define(F'PERMUTE_DIR3 3') -define(F'PERMUTE PERMUTE_{PRECSUFFIX};') -# load table -#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') -if PRECISION == 'double': - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') - define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') -else: - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') - define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') - - - -write('// DECLARATIONS') -definemultiline(F'DECLARATIONS_{PRECSUFFIX}') -# debugging register -if d['debug'] == True: - write(' Simd debugreg; \\') -# perm tables -if PRECISION == 'double': - write(' const uint64_t lut[4][8] = { \\') - write(' {4, 5, 6, 7, 0, 1, 2, 3}, \\') #0 = swap register halves - write(' {2, 3, 0, 1, 6, 7, 4, 5}, \\') #1 = swap halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6}, \\') #2 = swap re/im - write(' {0, 1, 2, 4, 5, 6, 7, 8} };\\') #3 = identity -else: - write(' const uint32_t lut[4][16] = { \\') - write(' {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \\') #0 = swap register halves - write(' {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \\') #1 = swap halves of halves - write(' {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \\') #2 = swap halves of halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \\') #3 = swap re/im - -#newline(target='A') -result_00.declare() -result_01.declare() -result_02.declare() -result_10.declare() -result_11.declare() -result_12.declare() -result_20.declare() -result_21.declare() -result_22.declare() -result_30.declare() -result_31.declare() -result_32.declare() # 12 -Chi_00.declare() -Chi_01.declare() -Chi_02.declare() -Chi_10.declare() -Chi_11.declare() -Chi_12.declare() # 6 
-UChi_00.declare() -UChi_01.declare() -UChi_02.declare() -UChi_10.declare() -UChi_11.declare() -UChi_12.declare() # 6 -U_00.declare() -U_10.declare() -U_20.declare() -U_01.declare() -U_11.declare() -U_21.declare() # 6 -> 30 regs - -# all predications true -pg1.declare() -if PRECISION == 'double': - pg1.movestr('svptrue_b64()') -else: - pg1.movestr('svptrue_b32()') - -# tables -if PRECISION == 'double': - write(' svuint64_t table0; \\', target='I') # -> 31 regs -else: - write(' svuint32_t table0; \\', target='I') # -> 31 regs - -zero0.declare() - -# zero register -asmopen() -zero0.zero(zeroreg=True) -asmclose() -newline() - -define('Chimu_00 Chi_00', target='I') -define('Chimu_01 Chi_01', target='I') -define('Chimu_02 Chi_02', target='I') -define('Chimu_10 Chi_10', target='I') -define('Chimu_11 Chi_11', target='I') -define('Chimu_12 Chi_12', target='I') -if ALTERNATIVE_REGISTER_MAPPING == False: - define('Chimu_20 UChi_00', target='I') - define('Chimu_21 UChi_01', target='I') - define('Chimu_22 UChi_02', target='I') - define('Chimu_30 UChi_10', target='I') - define('Chimu_31 UChi_11', target='I') - define('Chimu_32 UChi_12', target='I') -else: # wilson4.h - define('Chimu_20 U_00', target='I') - define('Chimu_21 U_10', target='I') - define('Chimu_22 U_20', target='I') - define('Chimu_30 U_01', target='I') - define('Chimu_31 U_11', target='I') - define('Chimu_32 U_21', target='I') -newline() - - -d['cycles_RESULT'] += 12 -write('// RESULT') -definemultiline(F'RESULT_{PRECSUFFIX}(base)') -if ASM_STORE: - curlyopen() - #write(' SiteSpinor & ref(out[ss]); \\') - asmopen() - #pg1.loadpredication() - #store_base_ptr("&ref[0][0]") - #store_base_ptr(F"&ref[{STORE_BASE_PTR_COLOR_OFFSET}][0]") - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - result_00.store("ref[0][0]") - result_01.store("ref[0][1]") - result_02.store("ref[0][2]") - result_10.store("ref[1][0]") - 
result_11.store("ref[1][1]") - result_12.store("ref[1][2]") - result_20.store("ref[2][0]") - result_21.store("ref[2][1]") - result_22.store("ref[2][2]") - result_30.store("ref[3][0]") - result_31.store("ref[3][1]") - result_32.store("ref[3][2]") - asmclose() - debugall('RESULT', group='result') - curlyclose() -newline() - -# prefetch spinors from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L2 (prefetch to L2)') -definemultiline(F'PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"base", target='A') -prefetch_L2(F"base", 0) -prefetch_L2(F"base", 1) -prefetch_L2(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch spinors from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -fetch_base_ptr(F"base", target='A') -prefetch_L1(F"base", 0) -prefetch_L1(F"base", 1) -prefetch_L1(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L2 (prefetch to L2)') -definemultiline(F'PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -else: - write(' const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L2(F"baseU", -1) -prefetch_L2(F"baseU", 0) -prefetch_L2(F"baseU", 1) -prefetch_L2(F"baseU", 2) 
-prefetch_L2(F"baseU", 3) -prefetch_L2(F"baseU", 4) -prefetch_L2(F"baseU", 5) -prefetch_L2(F"baseU", 6) -prefetch_L2(F"baseU", 7) -#prefetch_L2(F"baseU", 8) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L1(F"baseU", 0) -prefetch_L1(F"baseU", 1) -prefetch_L1(F"baseU", 2) -asmclose() -curlyclose() -newline() - -d['factor'] = 0 -write('// LOAD_CHI') -definemultiline(F'LOAD_CHI_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - fetch_base_ptr(F"base", target='I') - fetch_base_ptr(F"base", target='A') - - Chi_00.load("ref[0][0]", offset=0) - Chi_01.load("ref[0][1]", offset=0) - Chi_02.load("ref[0][2]", offset=0) - Chi_10.load("ref[1][0]", offset=0) - Chi_11.load("ref[1][1]", offset=0) - Chi_12.load("ref[1][2]", offset=0) - asmclose() - debugall('LOAD_CHI', group='Chi') - curlyclose() -newline() - - - -d['factor'] = 8 -# 12 loads = 12 issues, load latency = 8+1 cycles -# (not perfectly clear to me from docs) -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU') -definemultiline(F'LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - 
#fetch_base_ptr("&ref[0][0]") - #fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - # Chimu_00.load("ref[0][0]") - # Chimu_01.load("ref[0][1]") - # Chimu_02.load("ref[0][2]") - # Chimu_10.load("ref[1][0]") - # Chimu_11.load("ref[1][1]") - # Chimu_12.load("ref[1][2]") - # Chimu_20.load("ref[2][0]") - # Chimu_21.load("ref[2][1]") - # Chimu_22.load("ref[2][2]") - # Chimu_30.load("ref[3][0]") - # Chimu_31.load("ref[3][1]") - # Chimu_32.load("ref[3][2]") - - Chimu_00.load("ref[0][0]") # minimum penalty for all directions - Chimu_30.load("ref[3][0]") - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0213 -# placed into asm (...) 
-d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0213') -definemultiline(F'LOAD_CHIMU_0213_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_22.load("ref[2][2]") - - Chimu_10.load("ref[1][0]") - Chimu_30.load("ref[3][0]") - - Chimu_11.load("ref[1][1]") - Chimu_31.load("ref[3][1]") - - Chimu_12.load("ref[1][2]") - Chimu_32.load("ref[3][2]") - asmclose() - debugall('LOAD_CHIMU_0213', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0312 -# placed into asm (...) -d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0312') -definemultiline(F'LOAD_CHIMU_0312_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_30.load("ref[3][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU_0312', group='Chimu') - curlyclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE0') -definemultiline(F'LOAD_TABLE0') -asmopen() -table0.loadtable(0) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE1') -definemultiline(F'LOAD_TABLE1') -asmopen() -table0.loadtable(1) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 
1 * d['factor'] -write('// LOAD_TABLE2') -definemultiline(F'LOAD_TABLE2') -asmopen() -table0.loadtable(2) -asmclose() -newline() - -d['factor'] = 0 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE3') -definemultiline(F'LOAD_TABLE3') -asmopen() -table0.loadtable(3) -asmclose() -newline() - -d['factor'] = 2 # factor is 2 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERMUTE') -definemultiline(F'PERMUTE_{PRECSUFFIX}') -debugall('PERM PRE', group='Chi') -asmopen() -#table0.loadtable(2) -Chi_00.permute(2, table0) -Chi_01.permute(2, table0) -Chi_02.permute(2, table0) -Chi_10.permute(2, table0) -Chi_11.permute(2, table0) -Chi_12.permute(2, table0) -asmclose() -debugall('PERM POST', group='Chi') -newline() - -write('// LOAD_GAUGE') -definemultiline(F'LOAD_GAUGE') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -curlyopen() -asmopen() -pg1.loadpredication() -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -if ASM_LOAD_GAUGE: - fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") -asmclose() -curlyclose() -newline() - -d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total -# assume all U loads are hidden -# FCMLA issue latency = 2 cycles -# measurement: latency = 16 cycles if FULLY pipelined !? 
-# spec says 6+6+9 cycles -# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 -d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] -write('// MULT_2SPIN') -definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') -curlyopen() -#write(' const auto & ref(U[sU][A]); \\') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr("&ref[0][0]") -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='I') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='A') -#fetch_base_ptr(F"&ref[0][{FETCH_BASE_PTR_COLOR_OFFSET}]") -if ASM_LOAD_GAUGE: - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") - -if MOVPRFX == False: - UChi_00.zero() # implementation specific - UChi_10.zero() - UChi_01.zero() - UChi_11.zero() - UChi_02.zero() - UChi_12.zero() - - # round 1 - UChi_00.mul0(U_00, Chi_00) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(U_00, Chi_10) - UChi_01.mul0(U_10, Chi_00) - UChi_11.mul0(U_10, Chi_10) - UChi_02.mul0(U_20, Chi_00) - UChi_12.mul0(U_20, Chi_10) -else: - # round 1 - UChi_00.mul0(zero0, U_00, Chi_00, constructive=True) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(zero0, U_00, Chi_10, constructive=True) - UChi_01.mul0(zero0, U_10, Chi_00, constructive=True) - UChi_11.mul0(zero0, U_10, Chi_10, constructive=True) - UChi_02.mul0(zero0, U_20, Chi_00, constructive=True) - UChi_12.mul0(zero0, U_20, Chi_10, constructive=True) - -# round 2 -UChi_00.mul1(U_00, Chi_00) -UChi_10.mul1(U_00, Chi_10) -UChi_01.mul1(U_10, Chi_00) -UChi_11.mul1(U_10, Chi_10) -UChi_02.mul1(U_20, 
Chi_00) -UChi_12.mul1(U_20, Chi_10) # Chi_00 and Chi_10 available from here - -if ASM_LOAD_GAUGE: - U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded - U_10.load("ref[1][2]") # early load - U_20.load("ref[2][2]") # A --> -asmclose() -debugall('MULT_2SPIN_1', group='UChi') -curlyclose() -newline() - -write('// MULT_2SPIN_BACKEND') -definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') -curlyopen() -asmopen() -# round 3 -UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and -UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) -UChi_01.mac0(U_11, Chi_01) # autonomously using intrinsics -UChi_11.mac0(U_11, Chi_11) -UChi_02.mac0(U_21, Chi_01) -UChi_12.mac0(U_21, Chi_11) -# round 4 -UChi_00.mac1(U_01, Chi_01) -UChi_10.mac1(U_01, Chi_11) -UChi_01.mac1(U_11, Chi_01) -UChi_11.mac1(U_11, Chi_11) -UChi_02.mac1(U_21, Chi_01) -UChi_12.mac1(U_21, Chi_11) -# round 5 -UChi_00.mac0(U_00, Chi_02) # <-- A -UChi_10.mac0(U_00, Chi_12) -UChi_01.mac0(U_10, Chi_02) -UChi_11.mac0(U_10, Chi_12) -UChi_02.mac0(U_20, Chi_02) -UChi_12.mac0(U_20, Chi_12) -# round 6 -UChi_00.mac1(U_00, Chi_02) -UChi_10.mac1(U_00, Chi_12) -UChi_01.mac1(U_10, Chi_02) -UChi_11.mac1(U_10, Chi_12) -UChi_02.mac1(U_20, Chi_02) -UChi_12.mac1(U_20, Chi_12) -asmclose() -debugall('MULT_2SPIN_2', group='UChi') -curlyclose() -newline() - - -#// hspin(0)=fspin(0)+timesI(fspin(3)); -#// hspin(1)=fspin(1)+timesI(fspin(2)); -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XP_PROJ') -definemultiline(F'XP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_30) -Chi_01.addTimesI(Chimu_01, Chimu_31) -Chi_02.addTimesI(Chimu_02, Chimu_32) -Chi_10.addTimesI(Chimu_10, Chimu_20) -Chi_11.addTimesI(Chimu_11, Chimu_21) -Chi_12.addTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XP_PROJ', group='Chi') -curlyclose() -newline() - -#// fspin(0)=hspin(0); -#// 
fspin(1)=hspin(1); -#// fspin(2)=timesMinusI(hspin(1)); -#// fspin(3)=timesMinusI(hspin(0)); -# does not occur in GridBench -d['factor'] = 0 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON') -definemultiline(F'XP_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.subTimesI(UChi_10) - result_21.subTimesI(UChi_11) - result_22.subTimesI(UChi_12) - result_30.subTimesI(UChi_00) - result_31.subTimesI(UChi_01) - result_32.subTimesI(UChi_02) -else: - result_20.subTimesI(zero0, UChi_10, constructive=True) - result_21.subTimesI(zero0, UChi_11, constructive=True) - result_22.subTimesI(zero0, UChi_12, constructive=True) - result_30.subTimesI(zero0, UChi_00, constructive=True) - result_31.subTimesI(zero0, UChi_01, constructive=True) - result_32.subTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) # don't reorder ! -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) - -# result_00.add(UChi_00) # faster than move? 
-# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -asmclose() -debugall('XP_RECON', group='result') -newline() - - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON_ACCUM') -definemultiline(F'XP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_10) -# result_21.subTimesI(UChi_11) -# result_22.subTimesI(UChi_12) -# result_30.subTimesI(UChi_00) -# result_31.subTimesI(UChi_01) -# result_32.subTimesI(UChi_02) -# -# result_00.add(UChi_00) # reordered -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_31.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_32.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_20.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_21.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_22.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('XP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YP_PROJ') -definemultiline(F'YP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_30) -Chi_01.sub(Chimu_01, Chimu_31) -Chi_02.sub(Chimu_02, Chimu_32) -Chi_10.add(Chimu_10, Chimu_20) -Chi_11.add(Chimu_11, Chimu_21) -Chi_12.add(Chimu_12, Chimu_22) -asmclose() -debugall('YP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZP_PROJ') -definemultiline(F'ZP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() 
-#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_20) -Chi_01.addTimesI(Chimu_01, Chimu_21) -Chi_02.addTimesI(Chimu_02, Chimu_22) -Chi_10.subTimesI(Chimu_10, Chimu_30) -Chi_11.subTimesI(Chimu_11, Chimu_31) -Chi_12.subTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TP_PROJ') -definemultiline(F'TP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_20) -Chi_01.add(Chimu_01, Chimu_21) -Chi_02.add(Chimu_02, Chimu_22) -Chi_10.add(Chimu_10, Chimu_30) -Chi_11.add(Chimu_11, Chimu_31) -Chi_12.add(Chimu_12, Chimu_32) -asmclose() -debugall('TP_PROJ', group='Chi') -curlyclose() -newline() - -#// hspin(0)=fspin(0)-timesI(fspin(3)); -#// hspin(1)=fspin(1)-timesI(fspin(2)); - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XM_PROJ') -definemultiline(F'XM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_30) -Chi_01.subTimesI(Chimu_01, Chimu_31) -Chi_02.subTimesI(Chimu_02, Chimu_32) -Chi_10.subTimesI(Chimu_10, Chimu_20) -Chi_11.subTimesI(Chimu_11, Chimu_21) -Chi_12.subTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XM_PROJ sub', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON') -definemultiline(F'XM_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() - -# only necessary if not zeroed before -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.addTimesI(UChi_10) # <-- - result_21.addTimesI(UChi_11) - result_22.addTimesI(UChi_12) - result_30.addTimesI(UChi_00) - 
result_31.addTimesI(UChi_01) - result_32.addTimesI(UChi_02) -else: - result_20.addTimesI(zero0, UChi_10, constructive=True) # <-- - result_21.addTimesI(zero0, UChi_11, constructive=True) - result_22.addTimesI(zero0, UChi_12, constructive=True) - result_30.addTimesI(zero0, UChi_00, constructive=True) - result_31.addTimesI(zero0, UChi_01, constructive=True) - result_32.addTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) -asmclose() -debugall('XM_RECON result', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YM_PROJ') -definemultiline(F'YM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_30) -Chi_01.add(Chimu_01, Chimu_31) -Chi_02.add(Chimu_02, Chimu_32) -Chi_10.sub(Chimu_10, Chimu_20) -Chi_11.sub(Chimu_11, Chimu_21) -Chi_12.sub(Chimu_12, Chimu_22) -asmclose() -debugall('YM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZM_PROJ') -definemultiline(F'ZM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_20) -Chi_01.subTimesI(Chimu_01, Chimu_21) -Chi_02.subTimesI(Chimu_02, Chimu_22) -Chi_10.addTimesI(Chimu_10, Chimu_30) -Chi_11.addTimesI(Chimu_11, Chimu_31) -Chi_12.addTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TM_PROJ') -definemultiline(F'TM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() 
-asmopen() -pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_20) -Chi_01.sub(Chimu_01, Chimu_21) -Chi_02.sub(Chimu_02, Chimu_22) -Chi_10.sub(Chimu_10, Chimu_30) -Chi_11.sub(Chimu_11, Chimu_31) -Chi_12.sub(Chimu_12, Chimu_32) -asmclose() -debugall('TM_PROJ', group='Chi') -curlyclose() -newline() - -# does not occur in GridBench -d['factor'] = 0 -# add/sub issue latency = 1, latency is 9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON_ACCUM') -definemultiline(F'XM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -# result_20.addTimesI(UChi_10) -# result_21.addTimesI(UChi_11) -# result_22.addTimesI(UChi_12) -# result_30.addTimesI(UChi_00) -# result_31.addTimesI(UChi_01) -# result_32.addTimesI(UChi_02) -# -# # result_00.move(UChi_00) -# # result_01.move(UChi_01) -# # result_02.move(UChi_02) -# # result_10.move(UChi_10) -# # result_11.move(UChi_11) -# # result_12.move(UChi_12) -# -# # faster than move ? -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.addTimesI(UChi_00) # reordered -result_31.addTimesI(UChi_01) -result_32.addTimesI(UChi_02) - -result_20.addTimesI(UChi_10) -result_21.addTimesI(UChi_11) -result_22.addTimesI(UChi_12) - -result_00.add(UChi_00) -result_01.add(UChi_01) -result_02.add(UChi_02) -result_10.add(UChi_10) -result_11.add(UChi_11) -result_12.add(UChi_12) -asmclose() -debugall('XM_RECON_ACCUM', group='result') -newline() - - - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YP_RECON_ACCUM') -definemultiline(F'YP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_10) -# result_21.add(UChi_11) -# result_22.add(UChi_12) -# result_30.sub(UChi_00) -# result_31.sub(UChi_01) -# result_32.sub(UChi_02) - -result_00.add(UChi_00) # 
reordered -result_30.sub(UChi_00) - -result_01.add(UChi_01) -result_31.sub(UChi_01) - -result_02.add(UChi_02) -result_32.sub(UChi_02) - -result_10.add(UChi_10) -result_20.add(UChi_10) - -result_11.add(UChi_11) -result_21.add(UChi_11) - -result_12.add(UChi_12) -result_22.add(UChi_12) -asmclose() -debugall('YP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YM_RECON_ACCUM') -definemultiline(F'YM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_10) -# result_21.sub(UChi_11) -# result_22.sub(UChi_12) -# result_30.add(UChi_00) -# result_31.add(UChi_01) -# result_32.add(UChi_02) - -result_00.add(UChi_00) # reordered -result_30.add(UChi_00) - -result_01.add(UChi_01) -result_31.add(UChi_01) - -result_02.add(UChi_02) -result_32.add(UChi_02) - -result_10.add(UChi_10) -result_20.sub(UChi_10) - -result_11.add(UChi_11) -result_21.sub(UChi_11) - -result_12.add(UChi_12) -result_22.sub(UChi_12) -asmclose() -debugall('YM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZP_RECON_ACCUM') -definemultiline(F'ZP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_00) -# result_21.subTimesI(UChi_01) -# result_22.subTimesI(UChi_02) -# result_30.addTimesI(UChi_10) -# result_31.addTimesI(UChi_11) -# result_32.addTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.addTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.addTimesI(UChi_11) 
-result_11.add(UChi_11) - -result_32.addTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZM_RECON_ACCUM') -definemultiline(F'ZM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.addTimesI(UChi_00) -# result_21.addTimesI(UChi_01) -# result_22.addTimesI(UChi_02) -# result_30.subTimesI(UChi_10) -# result_31.subTimesI(UChi_11) -# result_32.subTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.addTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.addTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.addTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_32.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// TP_RECON_ACCUM') -definemultiline(F'TP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_00) -# result_21.add(UChi_01) -# result_22.add(UChi_02) -# result_30.add(UChi_10) -# result_31.add(UChi_11) -# result_32.add(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.add(UChi_00) - -result_01.add(UChi_01) -result_21.add(UChi_01) - -result_02.add(UChi_02) -result_22.add(UChi_02) - -result_10.add(UChi_10) -result_30.add(UChi_10) - -result_11.add(UChi_11) -result_31.add(UChi_11) - -result_12.add(UChi_12) -result_32.add(UChi_12) -asmclose() -debugall('TP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] 
-write('// TM_RECON_ACCUM') -definemultiline(F'TM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_00) -# result_21.sub(UChi_01) -# result_22.sub(UChi_02) -# result_30.sub(UChi_10) -# result_31.sub(UChi_11) -# result_32.sub(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.sub(UChi_00) - -result_01.add(UChi_01) -result_21.sub(UChi_01) - -result_02.add(UChi_02) -result_22.sub(UChi_02) - -result_10.add(UChi_10) -result_30.sub(UChi_10) - -result_11.add(UChi_11) -result_31.sub(UChi_11) - -result_12.add(UChi_12) -result_32.sub(UChi_12) -asmclose() -debugall('TM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 0 -# have 12 instructions -# picking dual issue versions -d['cycles_ZERO_PSI'] += 6 * d['factor'] -write('// ZERO_PSI') -definemultiline(F'ZERO_PSI_{PRECSUFFIX}') -asmopen() -pg1.loadpredication() -result_00.zero() -result_01.zero() -result_02.zero() -result_10.zero() -result_11.zero() -result_12.zero() -result_20.zero() -result_21.zero() -result_22.zero() -result_30.zero() -result_31.zero() -result_32.zero() -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# prefetch store spinors to L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') -definemultiline(F'PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L2_store(F"base", 0) -prefetch_L2_store(F"base", 1) -prefetch_L2_store(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch store spinors to L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') -definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() 
-fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L1_store(F"base", 0) -prefetch_L1_store(F"base", 1) -prefetch_L1_store(F"base", 2) -asmclose() -curlyclose() -newline() - - -d['factor'] = 0 -write('// ADD_RESULT_INTERNAL') -definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') -asmopen() -result_00.add(Chimu_00) -result_01.add(Chimu_01) -result_02.add(Chimu_02) -result_10.add(Chimu_10) -result_11.add(Chimu_11) -result_12.add(Chimu_12) -result_20.add(Chimu_20) -result_21.add(Chimu_21) -result_22.add(Chimu_22) -result_30.add(Chimu_30) -result_31.add(Chimu_31) -result_32.add(Chimu_32) -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# -------------------------------------------------------------------------------- - -# C -f = open('w.h', 'w') -f.write(d['C']) -f.close() - -# intrin -f = open('wi.h', 'w') -f.write(d['I']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_intrin_double.h" -else: - filename = "Fujitsu_A64FX_intrin_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['I']) -f.close() - - -# asm -f = open('wa.h', 'w') -f.write(d['A']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_asm_double.h" -else: - filename = "Fujitsu_A64FX_asm_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['A']) -f.close() - - -# arithmetics instruction count, mul/mac = 2 instructions each -d['acount'] = d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['addTimesI'] + d['subTimesI'] - -# permutations -d['permutes'] += 2*d['timesI'] + 1*d['timesMinusI'] -d['neg'] = 1*d['timesI'] + 1*d['timesMinusI'] - -# instruction count, mul/mac = 2 instructions each, +/- *i = 3 instructions each -d['icount'] = d['load'] + d['store'] + d['move'] + d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['permutes'] + d['neg'] + \ - d['addTimesI'] + d['subTimesI'] + d['zero'] + d['movprfx'] - -# flops -d['flops'] = 4*d['mac'] + 
3*d['mul'] + d['add'] + d['sub'] + \ - d['addTimesI'] + d['subTimesI'] - - - - - -print('Statistics') -print('') -print('Type Occurences Total / Arith instructions') -print('-------------------------------------------------------------------') -print('Variables {:4d}'.format(d['registers'])) -print('') -print('load {:4d}'.format(d['load'])) -print('store {:4d}'.format(d['store'])) -print('move {:4d}'.format(d['move'])) -print('movprfx {:4d}'.format(d['movprfx'])) -print('zero {:4d}'.format(d['zero'])) -print('negate {:4d}'.format(d['neg'])) - - -print('add {:4d} {:0.2f} / {:0.2f}'.\ - format(d['add'], d['add'] / d['icount'], d['add'] / d['acount'])) -print('sub {:4d} {:0.2f} / {:0.2f}'.\ - format(d['sub'], d['sub'] / d['icount'], d['sub'] / d['acount'])) -print('mul {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mul'], 2*d['mul'] / d['icount'], 2*d['mul'] / d['acount'])) -print('mac {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mac'], 2*d['mac'] / d['icount'], 2*d['mac'] / d['acount'])) -print('addTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['addTimesI'], 2*d['addTimesI'] / d['icount'], 2*d['addTimesI'] / d['acount'])) -print('subTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['subTimesI'], 2*d['subTimesI'] / d['icount'], 2*d['subTimesI'] / d['acount'])) - -print('timesI {:4d}'.format(d['timesI'])) -print('timesMinusI {:4d}'.format(d['timesMinusI'])) -print('permutes {:4d} {:0.2f}'.\ - format(d['permutes'], d['permutes'] / d['icount'])) -print('') -print('flops {:4d}'.format(d['flops'])) -print('instruction count {:4d}'.format(d['icount'])) -print('arith. 
instruction count {:4d} {:0.2f}'.\ - format(d['acount'], d['acount'] / d['icount'])) - - -# ---- static pipeline resources consumption ---- -FLA = 0 -FLA += 2 * d['mac'] + 2 * d['mul'] -FLA += 1 * d['addTimesI'] + 1 * d['subTimesI'] -FLA += 1 * d['move'] -FLA += 1 * d['permutes'] -FLA += 1 * d['store'] -FLA += 1 * d['zero'] - -FLB = 0 -FLB += 1 * d['addTimesI'] + 1 * d['subTimesI'] - -FLAB = 0 -FLAB += 1 * d['mac'] + 1 * d['mul'] -FLAB += 1 * d['add'] + 1 * d['sub'] -FLAB += 1 * d['neg'] + 1 * d['movprfx'] -#FLAB += 1 * d['zero'] - - -FL_slots = 2 * d['icount'] -FL_micro_ops = FLA + FLB + FLAB - -print('') -print('------------------------------------------------------------------') -print('') -print('Static FL slot usage') -print('') -print(' FLA {:4d}'.format(FLA)) -print(' FLB {:4d}'.format(FLB)) -print(' FLA/B {:4d}'.format(FLAB)) - -print('') -print('Static FL slot efficiency') -print('') -print(' Total FL slots {:4d}'.format(FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / FL_slots)) - -cycles_total = d['cycles_ZERO_PSI'] + d['cycles_LOAD_CHIMU'] + \ - d['cycles_PROJ'] + d['cycles_PERM'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] + d['cycles_RESULT'] -cycles_total_hidden = d['cycles_ZERO_PSI'] + \ - d['cycles_PROJ'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] - -# ---- dynamic estimate ---- - -print('') -print('Dynamic cycles estimate (incl. 
latencies)') -print('') -print(' ZERO_PSI {:4d}'.format(d['cycles_ZERO_PSI'])) -print(' LOAD_CHIMU {:4d}'.format(d['cycles_LOAD_CHIMU'])) -print(' PROJ {:4d}'.format(d['cycles_PROJ'])) -print(' PERM {:4d}'.format(d['cycles_PERM'])) -print(' MULT_2SPIN {:4d}'.format(d['cycles_MULT_2SPIN'])) -print(' RECON {:4d}'.format(d['cycles_RECON'])) -print(' STORE {:4d}'.format(d['cycles_RESULT'])) -print('') -print(' Sum {:4d}'.format(cycles_total)) -print('') -print(' Sum* {:4d}'.format(cycles_total_hidden)) -print(' Total FL slots* {:4d}'.format(cycles_total_hidden * 2)) -print(' FL slots occupied* {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency* {:0.2f}'.format(FL_micro_ops / (2*cycles_total_hidden))) -print('') -print(' *load/store/PERM hidden') - -estimated_cycles = cycles_total_hidden -# Estimate percent peak DP; dual issue, fma -pp = 100 * 4 * d['flops'] / (2*2*8*estimated_cycles) -print('') -print('Model prediction') -print('') -print(' Cycles* {:4d}'.format(estimated_cycles)) -print(' Percent peak* {:4.1f} %'.format(pp)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10 = (d['load'] + d['store']) * 64 * 2.2 / estimated_cycles -tp2 = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / estimated_cycles -print('') -print(' Estimated RF throughput* {:4.1f} GB/s'.\ - format(tp10)) -print(' Estimated RF throughput* {:4.1f} GiB/s'.\ - format(tp2)) - -# ---- dynamic pipeline resources consumption ---- - -runtime = measured_cycles # runtime in cycles -pp_runtime = 100 * 4 * d['flops'] / (2*2*8*runtime) -runtime_FL_slots = 2 * runtime -delta = runtime - estimated_cycles - - -print('') -print('------------------------------------------------------------------') -print('') -print('Dynamic runtime analysis (cycles from measurements)') -print('') -print(' Cycles {:4d}'.format(runtime)) -print(' Percent peak {:4.1f} %'.format(pp_runtime)) -print(' Deviation from estimate {:4d} {:4.2f} %'.\ - format(delta, 100. 
* abs(delta/runtime))) -print(' Deviation per direction {:4.1f}'.format(delta/8)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10_rt = (d['load'] + d['store']) * 64 * 2.2 / runtime -tp2_rt = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / runtime -print('') -print(' RF throughput {:4.1f} GB/s'.\ - format(tp10_rt)) -print(' RF throughput {:4.1f} GiB/s'.\ - format(tp2_rt)) -print('') -print(' Total FL slots {:4d}'.format(runtime_FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / runtime_FL_slots)) -print('') From 909acd55cd36c4b567cab30d311aab6b8674288d Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 02:00:22 +0100 Subject: [PATCH 093/201] vnum variant for prefetches --- Grid/simd/Fujitsu_A64FX_intrin_double.h | 36 ++++++++++++------------- Grid/simd/Fujitsu_A64FX_intrin_single.h | 36 ++++++++++++------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 361246fc..f195e3c5 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0),
SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ diff --git 
a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 30273b6e..0b874f02 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), 
SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ From 4b882e8056b2c9dd6dceab2729104e5e615835ae Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:09:20 +0100 Subject: [PATCH 094/201] fixed lost bracket --- Grid/simd/Fujitsu_A64FX_intrin_double.h | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index f195e3c5..b645c365 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -144,38 +144,38 @@ Author: Nils Meyer // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), 
SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd_vnum(pg1, (void*)(base), (int64_t)0), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)4), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(base), (int64_t)8), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)-4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)12), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)16), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)20), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)24), SV_PLDL2STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)28), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ - 
svprfd_vnum(pg1, (void*)(baseU), (int64_t)0), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)4), SV_PLDL1STRM); \ - svprfd_vnum(pg1, (void*)(baseU), (int64_t)8), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ From 6013183361d88fe7179b4fcf6b8321c0621b09ba Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:25:01 +0100 Subject: [PATCH 095/201] removed Asm impls --- Grid/simd/Fujitsu_A64FX_asm_double.h | 781 --------------------------- Grid/simd/Fujitsu_A64FX_asm_single.h | 781 --------------------------- 2 files changed, 1562 deletions(-) delete mode 100644 Grid/simd/Fujitsu_A64FX_asm_double.h delete mode 100644 Grid/simd/Fujitsu_A64FX_asm_single.h diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h deleted file mode 100644 index bbc4efe7..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ /dev/null @@ -1,781 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_double.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd -#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ZERO_PSI ZERO_PSI_A64FXd -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJ XP_PROJ_A64FXd -#define YP_PROJ YP_PROJ_A64FXd -#define ZP_PROJ ZP_PROJ_A64FXd -#define TP_PROJ TP_PROJ_A64FXd -#define XM_PROJ XM_PROJ_A64FXd -#define YM_PROJ YM_PROJ_A64FXd -#define ZM_PROJ ZM_PROJ_A64FXd -#define TM_PROJ TM_PROJ_A64FXd -#define XP_RECON XP_RECON_A64FXd -#define XM_RECON XM_RECON_A64FXd -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd -#define YP_RECON_ACCUM 
YP_RECON_ACCUM_A64FXd -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXd; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } -#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } -// DECLARATIONS -#define DECLARATIONS_A64FXd \ - uint64_t baseU; \ - const uint64_t lut[4][8] = { \ - {4, 5, 6, 7, 0, 1, 2, 3}, \ - {2, 3, 0, 1, 6, 7, 4, 5}, \ - {1, 0, 3, 2, 5, 4, 7, 6}, \ - {0, 1, 2, 4, 5, 6, 7, 8} };\ -asm ( \ - "ptrue p5.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -asm ( \ - "fmov z31.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXd(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ -{ \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { 
z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1d { z12.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z21.d }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1d { z13.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z22.d }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1d { z14.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z23.d }, p5/z, [%[fetchptr], 5, mul 
vl] \n\t" \ - "ld1d { z15.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z18.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z16.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z19.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1d { z17.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z20.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN -#define MULT_2SPIN_1_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1d { z27.d }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1d { z28.d }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1d { z29.d }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.d, p5/m, z31.d \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ - "movprfx 
z21.d, p5/m, z31.d \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ - "movprfx z19.d, p5/m, z31.d \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ - "movprfx z22.d, p5/m, z31.d \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ - "movprfx z20.d, p5/m, z31.d \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ - "movprfx z23.d, p5/m, z31.d \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ld1d { z24.d }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1d { z25.d }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1d { z26.d }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXd \ -{ \ -asm ( \ - "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ - "fcmla 
z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z18.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z19.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z18.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z19.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z20.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z21.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z22.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - 
"mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z21.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z22.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z23.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" 
\ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z7.d, p5/m, z7.d, 
z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXd \ -asm ( \ - "fmov z0.d , 0 \n\t" \ - "fmov z1.d , 0 \n\t" \ - "fmov z2.d , 0 \n\t" \ - "fmov z3.d , 0 \n\t" \ - "fmov z4.d , 0 \n\t" \ - "fmov z5.d , 0 \n\t" \ - "fmov z6.d , 0 \n\t" \ - "fmov z7.d , 0 \n\t" \ - "fmov z8.d , 0 \n\t" \ - "fmov z9.d , 0 \n\t" \ - "fmov z10.d , 0 \n\t" \ - "fmov z11.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) -#define 
PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z16.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h deleted file mode 100644 index e629f617..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ /dev/null @@ -1,781 +0,0 @@ 
-/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_single.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf -#define LOAD_CHI(base) 
LOAD_CHI_A64FXf(base) -#define ZERO_PSI ZERO_PSI_A64FXf -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJ XP_PROJ_A64FXf -#define YP_PROJ YP_PROJ_A64FXf -#define ZP_PROJ ZP_PROJ_A64FXf -#define TP_PROJ TP_PROJ_A64FXf -#define XM_PROJ XM_PROJ_A64FXf -#define YM_PROJ YM_PROJ_A64FXf -#define ZM_PROJ ZM_PROJ_A64FXf -#define TM_PROJ TM_PROJ_A64FXf -#define XP_RECON XP_RECON_A64FXf -#define XM_RECON XM_RECON_A64FXf -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXf -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXf; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } -#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } -// DECLARATIONS -#define DECLARATIONS_A64FXf \ - uint64_t baseU; \ - const uint32_t lut[4][16] = { \ - {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ - {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ - {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ -asm ( \ - "ptrue p5.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -asm ( \ - "fmov z31.s , 0 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXf(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXf(base) \ -{ \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], 3, mul 
vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ -{ \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - 
"ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ld1w { z12.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z21.s }, p5/z, [%[fetchptr], 3, mul vl] \n\t" \ - "ld1w { z13.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z22.s }, p5/z, [%[fetchptr], 4, mul vl] \n\t" \ - "ld1w { z14.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z23.s }, p5/z, [%[fetchptr], 5, mul vl] \n\t" \ - "ld1w { z15.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z18.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z16.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z19.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "ld1w { z17.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z20.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], 
%[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : 
[fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN -#define MULT_2SPIN_1_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ -asm ( \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -6, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -3, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 0, mul vl] \n\t" \ - "ld1w { z27.s }, p5/z, [%[fetchptr], -5, mul vl] \n\t" \ - "ld1w { z28.s }, p5/z, [%[fetchptr], -2, mul vl] \n\t" \ - "ld1w { z29.s }, p5/z, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.s, p5/m, z31.s \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ - "movprfx z21.s, p5/m, z31.s \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ - "movprfx z19.s, p5/m, z31.s \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ - "movprfx z22.s, p5/m, z31.s \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ - "movprfx z20.s, p5/m, z31.s \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ - "movprfx z23.s, p5/m, z31.s \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ld1w { z24.s }, p5/z, [%[fetchptr], -4, mul vl] \n\t" \ - "ld1w { z25.s }, p5/z, [%[fetchptr], -1, mul vl] \n\t" \ - "ld1w { z26.s }, p5/z, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define 
MULT_2SPIN_2_A64FXf \ -{ \ -asm ( \ - "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, 
z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z18.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z19.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z20.s \n\t" \ - : 
\ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z18.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z19.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z20.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z21.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z22.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "movprfx 
z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z21.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z22.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z23.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ - "fsub 
z13.s, p5/m, z13.s, z19.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z22.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM 
-#define YM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ - "fadd z4.s, 
p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXf \ -asm ( \ - "fmov z0.s , 0 \n\t" \ - "fmov z1.s , 0 \n\t" \ - "fmov z2.s , 0 \n\t" \ - "fmov z3.s , 0 \n\t" 
\ - "fmov z4.s , 0 \n\t" \ - "fmov z5.s , 0 \n\t" \ - "fmov z6.s , 0 \n\t" \ - "fmov z7.s , 0 \n\t" \ - "fmov z8.s , 0 \n\t" \ - "fmov z9.s , 0 \n\t" \ - "fmov z10.s , 0 \n\t" \ - "fmov z11.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - "dc zva, %[fetchptr]\n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z12.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z13.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z14.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : 
\ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - From 45d49d86487427ea1e0b34c0d530d475f8e3e31a Mon Sep 17 00:00:00 2001 From: Nils Meyer Date: Sat, 19 Dec 2020 03:35:18 +0100 Subject: [PATCH 096/201] clean up --- .../implementation/WilsonKernelsAsmBodyA64FX.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 83588a7d..4e463438 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -26,9 +26,9 @@ Author: Nils Meyer Regensburg University *************************************************************************************/ /* END LEGAL */ -// GCC 10 messes up SVE instruction scheduling using -O3 only, -// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders -// performance is better than armclang 20.2 +// GCC 10 messes up SVE instruction scheduling using -O3, but +// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance now is better than armclang 20.2 #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ @@ -118,10 +118,6 @@ Author: Nils Meyer Regensburg University /* NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty though I expected that it would improve on performance - - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ */ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ @@ -149,7 +145,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty if ( local || st.same_node[Dir] ) { \ MULT_2SPIN_1(Dir); \ MULT_2SPIN_2; \ - RECON; \ + RECON; \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ 
PREFETCH_CHIMU(base); \ @@ -300,7 +296,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty // DC ZVA test // { uint64_t basestore = (uint64_t)&out[ss]; - // PREFETCH_RESULT_L2_STORE(basestore); } + // PREFETCH_RESULT_L2_STORE(basestore); } ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); @@ -336,8 +332,8 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty // DC ZVA test //{ uint64_t basestore = (uint64_t)&out[ss]; - // PREFETCH_RESULT_L2_STORE(basestore); - //} + // PREFETCH_RESULT_L2_STORE(basestore); } + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); From e759367d42c14c2de4eac0349b9800b780b63988 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Fri, 8 Jan 2021 18:04:50 +0000 Subject: [PATCH 097/201] tested and working --- Grid/qcd/utils/BaryonUtils.h | 906 +++++++++++++++++------------------ 1 file changed, 427 insertions(+), 479 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 25c71e3a..35358d05 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -40,14 +40,7 @@ public: typedef typename FImpl::FermionField FermionField; typedef typename FImpl::PropagatorField PropagatorField; - typedef typename FImpl::SitePropagator pobj; - typedef typename ComplexField::vector_object vobj; - typedef Lattice> SpinMatrixField; - //typedef typename SpinMatrixField::vector_object sobj; - - //static const int epsilon[6][3] ; - //static const Real epsilon_sgn[6]; private: template accelerator_inline @@ -122,7 +115,7 @@ public: static void BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, - // const mobj2 &Dq3_spec, + const mobj2 &Dq3_spec, const mobj &Dq4_tf, const Gamma GammaJ, const Gamma GammaBi, @@ -134,7 +127,7 @@ public: static void BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, - //const mobj2 &Dq3_spec, + const mobj2 &Dq3_spec, const mobj &Dq4_tf, const Gamma GammaJ, const Gamma GammaBi, @@ -145,7 +138,7 @@ 
public: template accelerator_inline static void BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, - //const mobj2 &Dq2_spec, + const mobj2 &Dq2_spec, const mobj &Dq3_ti, const mobj &Dq4_tf, const Gamma GammaJ, @@ -230,13 +223,7 @@ public: const std::string op, SpinMatrixField &stn_corr); }; -/* -template -const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -template -const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; -*/ -//This is the old version +//This computes a baryon contraction on a lattice site, including the spin-trace of the correlation matrix template template accelerator_inline void BaryonUtils::BaryonSite(const mobj &D1, @@ -251,20 +238,20 @@ void BaryonUtils::BaryonSite(const mobj &D1, robj &result) { - Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) + Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - auto D1_GAi = D1 * GammaA_i; - auto D1_GAi_g4 = D1_GAi * g4; - auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4); - auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P; - auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P; + auto D1_GAi = D1 * GammaA_i; + auto D1_GAi_g4 = D1_GAi * g4; + auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4); + auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P; + auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P; - auto D2_GBi = D2 * GammaB_i; - auto GBf_D2_GBi = GammaB_f * D2_GBi; - auto GAf_D2_GBi = GammaA_f * D2_GBi; + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; - auto GBf_D3 = GammaB_f * D3; - auto GAf_D3 = GammaA_f * D3; + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; Real ee; @@ -273,86 +260,87 @@ void BaryonUtils::BaryonSite(const mobj &D1, int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c int eSgn_f = (ie_f < 3 ? 
1 : -1); - for (int ie_i=0; ie_i < 6 ; ie_i++){ - int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' - int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' - int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_i = (ie_i < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ - for (int rho=0; rho::BaryonSiteMatrix(const mobj &D1, robj &result) { - auto D1_GAi = D1 * GammaA_i; - auto GAf_D1_GAi = GammaA_f * D1_GAi; - auto GBf_D1_GAi = GammaB_f * D1_GAi; + auto D1_GAi = D1 * GammaA_i; + auto GAf_D1_GAi = GammaA_f * D1_GAi; + auto GBf_D1_GAi = GammaB_f * D1_GAi; - auto D2_GBi = D2 * GammaB_i; - auto GBf_D2_GBi = GammaB_f * D2_GBi; - auto GAf_D2_GBi = GammaA_f * D2_GBi; - - auto GBf_D3 = GammaB_f * D3; - auto GAf_D3 = GammaA_f * D3; + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; Real ee; @@ -388,96 +375,101 @@ void BaryonUtils::BaryonSiteMatrix(const mobj &D1, int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c int eSgn_f = (ie_f < 3 ? 1 : -1); - for (int ie_i=0; ie_i < 6 ; ie_i++){ - int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' - int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' - int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_i = (ie_i < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? 
ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ - for (int rho_i=0; rho_i::ContractBaryons(const PropagatorField &q1_left, for (int ie=0; ie < 6 ; ie++){ if(ie==0 or ie==3){ bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; - } - else{ + } else{ bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; } } @@ -653,7 +644,7 @@ template accelerator_inline void BaryonUtils::BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, - // const mobj2 &Dq3_spec, + const mobj2 &Dq3_spec, const mobj &Dq4_tf, const Gamma GammaJ, const Gamma GammaBi, @@ -661,18 +652,14 @@ void BaryonUtils::BaryonGamma3ptGroup1Site( int wick_contraction, robj &result) { - Gamma g5(Gamma::Algebra::Gamma5); - -// auto adjD4_g_D1 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti; - auto adjD4 = g5 * adj(Dq4_tf) * g5 ; - auto adjD4_g_D1 = adjD4 * GammaJ * Dq1_ti; - auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1; - auto D2_Gi = Dq2_spec * GammaBi; - auto Gf_D2_Gi = GammaBf * D2_Gi; - -// auto Gf_D3 = GammaBf * Dq3_spec; // including a second mobj2 parameter leads to compilation error - auto Gf_D3 = GammaBf * Dq2_spec; //WRONG!!!!! + Gamma g5(Gamma::Algebra::Gamma5); + auto adjD4 = g5 * adj(Dq4_tf) * g5 ; + auto adjD4_g_D1 = adjD4 * GammaJ * Dq1_ti; + auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; + auto Gf_D3 = GammaBf * Dq3_spec; Real ee; @@ -681,65 +668,65 @@ void BaryonUtils::BaryonGamma3ptGroup1Site( int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b int c_f = (ie_f < 3 ? 
(ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c int eSgn_f = (ie_f < 3 ? 1 : -1); - for (int ie_i=0; ie_i < 6 ; ie_i++){ - int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' - int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' - int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_i = (ie_i < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + ee = Real(eSgn_f * eSgn_i); - for (int alpha_f=0; alpha_f accelerator_inline void BaryonUtils::BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, - // const mobj2 &Dq3_spec, + const mobj2 &Dq3_spec, const mobj &Dq4_tf, const Gamma GammaJ, const Gamma GammaBi, @@ -759,14 +746,12 @@ void BaryonUtils::BaryonGamma3ptGroup2Site( int wick_contraction, robj &result) { - Gamma g5(Gamma::Algebra::Gamma5); - - auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi; - auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi; - auto Gf_D1 = GammaBf * Dq1_spec; - //auto Gf_D3 = GammaBf * Dq3_spec; - auto Gf_D3 = GammaBf * Dq1_spec; // WRONG!!!!! + Gamma g5(Gamma::Algebra::Gamma5); + auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi; + auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi; + auto Gf_D1 = GammaBf * Dq1_spec; + auto Gf_D3 = GammaBf * Dq3_spec; Real ee; @@ -775,64 +760,64 @@ void BaryonUtils::BaryonGamma3ptGroup2Site( int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c int eSgn_f = (ie_f < 3 ? 1 : -1); - for (int ie_i=0; ie_i < 6 ; ie_i++){ - int a_i = (ie_i < 3 ? 
ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' - int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' - int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_i = (ie_i < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - for (int alpha_f=0; alpha_f template accelerator_inline void BaryonUtils::BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, - // const mobj2 &Dq2_spec, + const mobj2 &Dq2_spec, const mobj &Dq3_ti, const mobj &Dq4_tf, const Gamma GammaJ, @@ -852,15 +837,13 @@ void BaryonUtils::BaryonGamma3ptGroup3Site( int wick_contraction, robj &result) { - Gamma g5(Gamma::Algebra::Gamma5); - - auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti; - auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3; - auto Gf_D1 = GammaBf * Dq1_spec; - //auto D2_Gi = Dq2_spec * GammaBi; - auto D2_Gi = Dq1_spec * GammaBi; //WRONG!!!!!!!!!!!!!!!!! - auto Gf_D2_Gi = GammaBf * D2_Gi; + Gamma g5(Gamma::Algebra::Gamma5); + auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti; + auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3; + auto Gf_D1 = GammaBf * Dq1_spec; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; Real ee; @@ -869,62 +852,64 @@ void BaryonUtils::BaryonGamma3ptGroup3Site( int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c int eSgn_f = (ie_f < 3 ? 1 : -1); - for (int ie_i=0; ie_i < 6 ; ie_i++){ - int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' - int b_i = (ie_i < 3 ? 
(ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' - int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_i = (ie_i < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - for (int alpha_f=0; alpha_f::BaryonGamma3pt( assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - GridBase *grid = q_tf.Grid(); + GridBase *grid = q_tf.Grid(); - // autoView( vcorr, stn_corr, CpuWrite); - // autoView( vq_ti , q_ti, CpuRead); - // autoView( vq_tf , q_tf, CpuRead); + autoView( vcorr , stn_corr , AcceleratorWrite); + autoView( vq_ti , q_ti , AcceleratorRead); + autoView( vq_tf , q_tf , AcceleratorRead); - // if (group == 1) { - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - // auto Dq_ti = vq_ti[ss]; - // auto Dq_tf = vq_tf[ss]; - // sobj result=Zero(); - // BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - // vcorr[ss] += result; - // });//end loop over lattice sites - // } else if (group == 2) { - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - // auto Dq_ti = vq_ti[ss]; - // auto Dq_tf = vq_tf[ss]; - // sobj result=Zero(); - // BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - // vcorr[ss] += result; - // });//end loop over lattice sites - // } else if (group == 3) { - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - // auto Dq_ti = vq_ti[ss]; - // auto Dq_tf = vq_tf[ss]; - // sobj result=Zero(); - // 
BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + Vector my_Dq_spec{Dq_spec1,Dq_spec2}; + mobj * Dq_spec_p = &my_Dq_spec[0]; - // vcorr[ss] += result; - // });//end loop over lattice sites - // } + if (group == 1) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec_p[0],Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites - autoView( vcorr , stn_corr , AcceleratorWrite); - autoView( vq_ti , q_ti , AcceleratorRead); - autoView( vq_tf , q_tf , AcceleratorRead); - - if (group == 1) { - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti(ss); - auto Dq_tf = vq_tf(ss); - //sobj result=Zero(); - typedef decltype(coalescedRead(vcorr[0])) spinor; - spinor result=Zero(); - //BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); //WRONG - // vcorr[ss] += result; - coalescedWrite(vcorr[ss],result); - });//end loop over lattice sites - - } else if (group == 2) { - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti(ss); - auto Dq_tf = vq_tf(ss); - //sobj result=Zero(); - typedef decltype(coalescedRead(vcorr[0])) spinor; - spinor result=Zero(); - // BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); //WRONG - // vcorr[ss] += result; - coalescedWrite(vcorr[ss],result); - });//end loop over lattice sites - } else if (group == 3) { - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti(ss); - auto Dq_tf = vq_tf(ss); 
- //sobj result=Zero(); - typedef decltype(coalescedRead(vcorr[0])) spinor; - spinor result=Zero(); - //BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - BaryonGamma3ptGroup3Site(Dq_spec1,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); //WRONG - // vcorr[ss] += result; - coalescedWrite(vcorr[ss],result); - });//end loop over lattice sites - } + } else if (group == 2) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + BaryonGamma3ptGroup2Site(Dq_spec_p[0],Dq_ti,Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else if (group == 3) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + BaryonGamma3ptGroup3Site(Dq_spec_p[0],Dq_spec_p[1],Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } } @@ -1052,7 +1000,6 @@ void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, Gamma g5(Gamma::Algebra::Gamma5); - //auto Gn_adjDd_GH_Ds = GammaB_nucl * g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; auto adjDd_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; auto Gn_adjDd_GH_Ds = GammaB_nucl * adjDd_GH_Ds; auto Du_Gs = Du_spec * GammaB_sigma; @@ -1066,33 +1013,33 @@ void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b int c_n = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c int eSgn_n = (ie_n < 3 ? 1 : -1); - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' - int b_s = (ie_s < 3 ? 
(ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' - int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_s = (ie_s < 3 ? 1 : -1); + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' + int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' + int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_s = (ie_s < 3 ? 1 : -1); - ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + ee = Real(eSgn_n * eSgn_s); + for (int alpha_n=0; alpha_n::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, Gamma g5(Gamma::Algebra::Gamma5); auto Du_Gs = Du_spec * GammaB_sigma; - //auto Gn_adjDd_GH_Ds = GammaB_nucl * g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; auto adjDd_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; auto Gn_adjDd_GH_Ds = GammaB_nucl * adjDd_GH_Ds; auto adjDu_GH_Du = g5 * adj(Du_tf) * g5 * Gamma_H * Du_ti; @@ -1129,40 +1075,41 @@ void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b int c_n = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c int eSgn_n = (ie_n < 3 ? 1 : -1); - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' - int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' - int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_s = (ie_s < 3 ? 1 : -1); + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' + int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' + int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_s = (ie_s < 3 ? 
1 : -1); - ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - for (int alpha_n=0; alpha_n::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, Gamma g5(Gamma::Algebra::Gamma5); - //auto Gn_adjDd_GH_Duloop_GH_Ds = GammaB_nucl * g5 * adj(Dd_tf) * g5 * Gamma_H * Dq_loop * Gamma_H * Ds_ti; auto adjDd_GH_Duloop_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Dq_loop * Gamma_H * Ds_ti; auto Gn_adjDd_GH_Duloop_GH_Ds = GammaB_nucl * adjDd_GH_Duloop_GH_Ds; auto Du_Gs = Du_spec * GammaB_sigma; @@ -1196,32 +1142,33 @@ void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b int c_n = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c int eSgn_n = (ie_n < 3 ? 1 : -1); - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' - int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' - int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_s = (ie_s < 3 ? 1 : -1); + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' + int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' + int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_s = (ie_s < 3 ? 
1 : -1); - ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - for (int alpha_n=0; alpha_n::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, auto Du_Gs = Du_spec * GammaB_sigma; auto adjDu_GH_Ds = g5 * adj(Du_tf) * g5 * Gamma_H * Ds_ti; - //auto Gn_adjDd_GH_Du = GammaB_nucl * g5 * adj(Dd_tf) * g5 * Gamma_H * Du_ti; auto adjDd_GH_Du = g5 * adj(Dd_tf) * g5 * Gamma_H * Du_ti; auto Gn_adjDd_GH_Du = GammaB_nucl * adjDd_GH_Du; // for some reason I needed to split this into two lines to avoid the compilation error 'error: identifier "Grid::Gamma::mul" is undefined in device code' @@ -1259,43 +1205,45 @@ void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b int c_n = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c int eSgn_n = (ie_n < 3 ? 1 : -1); - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' - int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' - int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' - int eSgn_s = (ie_s < 3 ? 1 : -1); + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' + int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' + int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_s = (ie_s < 3 ? 
1 : -1); - ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - for (int alpha_n=0; alpha_n From 74de2d9742de7a85850cf40cbbe0215edc3405da Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Fri, 8 Jan 2021 18:28:36 +0000 Subject: [PATCH 098/201] whitespace changes --- Grid/qcd/utils/BaryonUtils.h | 156 +++++++++++++++++------------------ 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 35358d05..80a80a76 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -53,7 +53,7 @@ public: const Gamma GammaB_right, const int parity, const bool * wick_contractions, - robj &result); + robj &result); template accelerator_inline static void BaryonSiteMatrix(const mobj &D1, const mobj &D2, @@ -63,7 +63,7 @@ public: const Gamma GammaA_right, const Gamma GammaB_right, const bool * wick_contractions, - robj &result); + robj &result); public: static void WickContractions(std::string qi, std::string qf, @@ -117,9 +117,9 @@ public: const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result); @@ -129,9 +129,9 @@ public: const mobj &Dq2_ti, const mobj2 &Dq3_spec, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result); @@ -141,9 +141,9 @@ public: const mobj2 &Dq2_spec, const mobj &Dq3_ti, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result); public: @@ -155,9 +155,9 @@ public: const PropagatorField &q_tf, int group, int wick_contraction, - const Gamma 
GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, SpinMatrixField &stn_corr); private: template accelerator_inline @@ -165,9 +165,9 @@ public: const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, robj &result); template accelerator_inline static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, @@ -175,9 +175,9 @@ public: const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, robj &result); @@ -186,9 +186,9 @@ public: const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, robj &result); template accelerator_inline static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, @@ -196,9 +196,9 @@ public: const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, robj &result); public: template @@ -209,7 +209,7 @@ public: const Gamma Gamma_H, const Gamma GammaB_sigma, const Gamma GammaB_nucl, - const std::string op, + const std::string op, SpinMatrixField &stn_corr); template static void SigmaToNucleonNonEye(const PropagatorField &qq_ti, @@ -220,7 +220,7 @@ public: const Gamma Gamma_H, const Gamma GammaB_sigma, const Gamma GammaB_nucl, - const std::string op, + const std::string op, SpinMatrixField &stn_corr); }; //This computes a baryon contraction on a lattice site, including the spin-trace of the correlation matrix @@ -229,10 +229,10 @@ template accelerator_inline 
void BaryonUtils::BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, - const Gamma GammaA_i, - const Gamma GammaB_i, - const Gamma GammaA_f, - const Gamma GammaB_f, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, const int parity, const bool * wick_contraction, robj &result) @@ -349,10 +349,10 @@ template accelerator_inline void BaryonUtils::BaryonSiteMatrix(const mobj &D1, const mobj &D2, const mobj &D3, - const Gamma GammaA_i, - const Gamma GammaB_i, - const Gamma GammaA_f, - const Gamma GammaB_f, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, const bool * wick_contraction, robj &result) { @@ -496,10 +496,10 @@ template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, const bool* wick_contractions, const int parity, ComplexField &baryon_corr) @@ -548,10 +548,10 @@ template void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, const bool* wick_contractions, SpinMatrixField &baryon_corr) { @@ -612,10 +612,10 @@ template void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, const mobj &D2, const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, const bool* wick_contractions, const int 
nt, robj &result) @@ -646,9 +646,9 @@ void BaryonUtils::BaryonGamma3ptGroup1Site( const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result) { @@ -740,9 +740,9 @@ void BaryonUtils::BaryonGamma3ptGroup2Site( const mobj &Dq2_ti, const mobj2 &Dq3_spec, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result) { @@ -831,9 +831,9 @@ void BaryonUtils::BaryonGamma3ptGroup3Site( const mobj2 &Dq2_spec, const mobj &Dq3_ti, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result) { @@ -926,9 +926,9 @@ void BaryonUtils::BaryonGamma3pt( const PropagatorField &q_tf, int group, int wick_contraction, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, SpinMatrixField &stn_corr) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); @@ -992,9 +992,9 @@ void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, robj &result) { @@ -1054,9 +1054,9 @@ void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, robj &result) { @@ -1123,9 +1123,9 @@ void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 
&Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, robj &result) { @@ -1183,9 +1183,9 @@ void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, robj &result) { @@ -1252,9 +1252,9 @@ void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, const std::string op, SpinMatrixField &stn_corr) { @@ -1296,9 +1296,9 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, const std::string op, SpinMatrixField &stn_corr) { From 45fc7ded3a18b8fc905127fb4d987a5bb16654e1 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 12 Jan 2021 09:10:37 +0000 Subject: [PATCH 099/201] test for sum --- Grid/qcd/utils/BaryonUtils.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 80a80a76..828b4085 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -950,7 +950,8 @@ void BaryonUtils::BaryonGamma3pt( typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec_p[0],Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - 
coalescedWrite(vcorr[ss],result); + //coalescedWrite(vcorr[ss],vcorr[ss]+result); //diff by factor 10??? + coalescedWrite(vcorr[ss],vcorr[ss]+result); });//end loop over lattice sites } else if (group == 2) { @@ -1271,6 +1272,9 @@ void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, bool doQ1 = (op == "Q1"); bool doQ2 = (op == "Q2"); + + Vector my_Dq_spec{Du_spec}; + mobj * Dq_spec_p = &my_Dq_spec[0]; accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_loop = vq_loop(ss); @@ -1279,9 +1283,9 @@ void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); if(doQ1){ - SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1EyeSite(Dq_loop,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(doQ2){ - SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2EyeSite(Dq_loop,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } @@ -1316,6 +1320,9 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, bool doQ1 = (op == "Q1"); bool doQ2 = (op == "Q2"); + + Vector my_Dq_spec{Du_spec}; + mobj * Dq_spec_p = &my_Dq_spec[0]; accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_ti = vq_ti(ss); @@ -1325,9 +1332,9 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); if(doQ1){ - SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(doQ2){ - SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + 
SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } From fa12b9a3295f1fdf392b0367de4136e576b5401e Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Wed, 13 Jan 2021 10:04:17 +0000 Subject: [PATCH 100/201] bugfix --- Grid/qcd/utils/BaryonUtils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 828b4085..69bf8959 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -961,7 +961,7 @@ void BaryonUtils::BaryonGamma3pt( typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup2Site(Dq_spec_p[0],Dq_ti,Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],result); + coalescedWrite(vcorr[ss],vcorr[ss]+result); });//end loop over lattice sites } else if (group == 3) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { @@ -970,7 +970,7 @@ void BaryonUtils::BaryonGamma3pt( typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup3Site(Dq_spec_p[0],Dq_spec_p[1],Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],result); + coalescedWrite(vcorr[ss],vcorr[ss]+result); });//end loop over lattice sites } From a4afc3ea2aeb23a5a5a4dece03087e6344c9986b Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:44:16 -0500 Subject: [PATCH 101/201] Red black coarse space --- tests/solver/Test_dwf_hdcr.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 8e083231..f68e99ab 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -222,9 +222,16 @@ int main (int argc, char ** argv) GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, 
GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); - GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; + + + GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian *CoarseCoarse5d = SpaceTimeGrid::makeFiveDimGrid(1,CoarseCoarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + GridRedBlackCartesian *CoarseCoarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseCoarse4d); + GridRedBlackCartesian *CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); std::vector cseeds({5,6,7,8}); @@ -282,8 +289,7 @@ int main (int argc, char ** argv) Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); - + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); ////////////////////////////////////////////////// // Deflate the course space. Recursive multigrid? 
@@ -311,12 +317,11 @@ int main (int argc, char ** argv) } } - Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + Level2Op L2Op(*CoarseCoarse5d,*CoarseCoarse5dRB,1); // Hermitian matrix typedef Level2Op::CoarseVector CoarseCoarseVector; HermitianLinearOperator L1LinOp(LDOp); L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); - std::cout< Date: Thu, 14 Jan 2021 20:46:21 -0500 Subject: [PATCH 102/201] Coarsened vector test --- Grid/qcd/QCD.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index 76d7def4..858aead7 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -80,6 +80,13 @@ template struct isSpinor { template using IfSpinor = Invoke::value,int> > ; template using IfNotSpinor = Invoke::value,int> > ; +const int CoarseIndex = 4; +template struct isCoarsened { + static constexpr bool value = (CoarseIndex<=T::TensorLevel); +}; +template using IfCoarsened = Invoke::value,int> > ; +template using IfNotCoarsened = Invoke::value,int> > ; + // ChrisK very keen to add extra space for Gparity doubling. 
// // Also add domain wall index, in a way where Wilson operator From eaff0f3aeb05635d49e17cb6e271621040f5b7f1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:46:58 -0500 Subject: [PATCH 103/201] Gamma5 on coaree spaces --- Grid/qcd/spin/TwoSpinor.h | 179 ++++++++------------------------------ 1 file changed, 35 insertions(+), 144 deletions(-) diff --git a/Grid/qcd/spin/TwoSpinor.h b/Grid/qcd/spin/TwoSpinor.h index 924594ab..8dad0cd0 100644 --- a/Grid/qcd/spin/TwoSpinor.h +++ b/Grid/qcd/spin/TwoSpinor.h @@ -128,7 +128,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spProjTm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(0)-fspin(2); hspin(1)=fspin(1)-fspin(3); } @@ -138,40 +137,50 @@ template > = 0> accelerator_inline void s * 0 0 -1 0 * 0 0 0 -1 */ - template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(0); hspin(1)=fspin(1); } template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(2); hspin(1)=fspin(3); } -// template accelerator_inline void fspProj5p (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; rfspin(0)=fspin(0); rfspin(1)=fspin(1); rfspin(2)=Zero(); rfspin(3)=Zero(); } -// template accelerator_inline void fspProj5m (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5m (iVector &rfspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; rfspin(0)=Zero(); rfspin(1)=Zero(); rfspin(2)=fspin(2); rfspin(3)=fspin(3); } +template > = 0> accelerator_inline 
void spProj5p (iVector &rfspin,const iVector &fspin) +{ + const int hN = N>>1; + for(int s=0;s > = 0> accelerator_inline void spProj5m (iVector &rfspin,const iVector &fspin) +{ + const int hN = N>>1; + for(int s=0;s > = 0> accelerator_inline void s */ template > = 0> accelerator_inline void spReconXp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesMinusI(hspin(1)); @@ -191,7 +199,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconXm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesI(hspin(1)); @@ -199,7 +206,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconXp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(1)); @@ -207,7 +213,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconXm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(1)); @@ -221,7 +226,6 @@ template > = 0> accelerator_inline void a template > = 0> accelerator_inline void spReconYp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)= hspin(1); @@ -229,7 +233,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconYm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=-hspin(1); @@ -237,7 +240,6 @@ template > = 0> accelerator_inline 
void s } template > = 0> accelerator_inline void accumReconYp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(1); @@ -245,7 +247,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconYm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(1); @@ -260,7 +261,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spReconZp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesMinusI(hspin(0)); @@ -268,7 +268,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconZm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)= timesI(hspin(0)); @@ -276,7 +275,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconZp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(0)); @@ -284,7 +282,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconZm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(0)); @@ -298,7 +295,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spReconTp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=hspin(0); @@ 
-306,7 +302,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconTm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=-hspin(0); @@ -314,7 +309,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconTp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(0); @@ -322,7 +316,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconTm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(0); @@ -336,7 +329,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spRecon5p (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though fspin(2)=Zero(); @@ -344,7 +336,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spRecon5m (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=Zero(); fspin(1)=Zero(); fspin(2)=hspin(0)+hspin(0); @@ -352,7 +343,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumRecon5p (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0)+hspin(0); fspin(1)+=hspin(1)+hspin(1); } @@ -372,7 +362,6 @@ template > = 0> accelerator_inline void a ////////// template > = 0> accelerator_inline void spProjXp (iVector &hspin,const iVector &fspin) { - //typename 
std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconXp (iM }} } - - //////// // Xm //////// template accelerator_inline void spProjXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjXm (iMatri template accelerator_inline void spReconXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconXm (iMatr template accelerator_inline void accumReconXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar 
>::type *temp; spProjYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iMatri template accelerator_inline void spReconYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYp (iMatr template accelerator_inline void accumReconYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjYm (iMatrix 
&hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,const iVector >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iMatr template accelerator_inline void accumReconYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconYm (iM //////// template accelerator_inline void spProjZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; 
spReconZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iM //////// template accelerator_inline void spProjZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconZm (iMatrix &hspin,const iMatrix &fspin) { - //typename 
std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iM //////// template accelerator_inline void spProjTp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjTp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjTp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconTp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iMatr template accelerator_inline void accumReconTp (iScalar &hspin, iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconTp(hspin._internal,fspin._internal); } template > 
= 0> accelerator_inline void accumReconTp (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconTp (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjTm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iMatri template accelerator_inline void spReconTm (iScalar &hspin, const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconTm (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconTm (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTm (iMatr template accelerator_inline void accumReconTm (iScalar &hspin, const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconTm (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconTm (iMatrix &hspin, const iMatrix &fspin) { - //typename 
std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatri template accelerator_inline void spRecon5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spRecon5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spRecon5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spRecon5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spRecon5p (iMatr template accelerator_inline void accumRecon5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumRecon5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumRecon5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumRecon5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5p 
(iM } // four spinor projectors for chiral proj -// template accelerator_inline void fspProj5p (iScalar &hspin,const iScalar &fspin) -template accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5p(hspin._internal,fspin._internal); } -// template accelerator_inline void fspProj5p (iVector &hspin,iVector &fspin) -template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void fspProj5p (iMatrix &hspin,iMatrix &fspin) -template accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatrix & // 5m //////// -template accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) { spProj5m(hspin._internal,fspin._internal); } -template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { for(int i=0;i accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { for(int i=0;i accelerator_inline void spProj5m (iMatri template accelerator_inline void spRecon5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; 
spRecon5m(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spRecon5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spRecon5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumRecon5m(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumRecon5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iM // four spinor projectors for chiral proj -// template accelerator_inline void fspProj5m (iScalar &hspin,const iScalar &fspin) -template accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5m(hspin._internal,fspin._internal); } -// template accelerator_inline void fspProj5m (iVector &hspin,iVector &fspin) -template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void fspProj5m (iMatrix &hspin,iMatrix &fspin) -template accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { - 
//typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i Date: Thu, 14 Jan 2021 20:47:28 -0500 Subject: [PATCH 104/201] G5 on coarse spaces --- Grid/qcd/utils/LinalgUtils.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h index 1e016e4e..964b83d5 100644 --- a/Grid/qcd/utils/LinalgUtils.h +++ b/Grid/qcd/utils/LinalgUtils.h @@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; decltype(coalescedRead(y_v[ss+sp])) tmp; - spProj5m(tmp,y_v(ss+sp)); - tmp = a*x_v(ss+s)+b*tmp; + spProj5m(tmp,y_v(ss+sp)); + tmp = a*x_v(ss+s)+b*tmp; coalescedWrite(z_v[ss+s],tmp); }); } @@ -188,7 +188,6 @@ void G5R5(Lattice &z,const Lattice &x) z.Checkerboard() = x.Checkerboard(); conformable(x,z); int Ls = grid->_rdimensions[0]; - Gamma G5(Gamma::Algebra::Gamma5); autoView( x_v, x, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; @@ -196,7 +195,13 @@ void G5R5(Lattice &z,const Lattice &x) uint64_t ss = sss*Ls; for(int s=0;s &z, const Lattice &x) z.Checkerboard() = x.Checkerboard(); conformable(x, z); - Gamma G5(Gamma::Algebra::Gamma5); - z = G5 * x; + autoView( x_v, x, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); + uint64_t nloop = grid->oSites(); + accelerator_for(ss,nloop,vobj::Nsimd(),{ + auto tmp = x_v(ss); + decltype(tmp) tmp_p; + decltype(tmp) tmp_m; + spProj5p(tmp_p,tmp); + spProj5m(tmp_m,tmp); + coalescedWrite(z_v[ss],tmp_p - tmp_m); + }); } +/* template void G5C(Lattice> &z, const Lattice> &x) { @@ -234,6 +249,7 @@ void G5C(Lattice> &z, const Lattice Date: Thu, 14 Jan 2021 20:48:08 -0500 Subject: [PATCH 105/201] Red black support on coars --- Grid/algorithms/CoarsenedMatrix.h | 60 +++++++++++++++++++------------ 1 file changed, 37 insertions(+), 23 deletions(-) diff --git 
a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 66b9c169..b9594678 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -775,7 +775,26 @@ public: for(int p=0;p FineComplexField; typedef typename Fobj::scalar_type scalar_type; + std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl; + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); @@ -847,11 +868,13 @@ public: CoarseScalar InnerProd(Grid()); + std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl; // Orthogonalise the subblocks over the basis blockOrthogonalise(InnerProd,Subspace.subspace); // Compute the matrix elements of linop between this orthonormal // set of vectors. + std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl; int self_stencil=-1; for(int p=0;poSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); + if ( hermitian && (disp==-1) ) { + for(int pp=0;pp = * + int dirp = geom.directions[pp]; + int dispp = geom.displacements[pp]; + if ( (dirp==dir) && (dispp==1) ){ + auto sft = conjugate(Cshift(oZProj,dir,1)); + autoView( sft_v , sft , AcceleratorWrite); + autoView( A_pp , A[pp], AcceleratorWrite); + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); }); + } + } + } } } @@ -957,33 +992,12 @@ public: } if(hermitian) { std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); From 579595f547bd36775ba42ecd07f9a881e4f12e85 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 14 Jan 2021 20:48:35 -0500 Subject: [PATCH 106/201] Red black on coarse space --- tests/solver/Test_dwf_hdcr_2level.cc | 8 ++++++-- tests/solver/Test_dwf_multigrid.cc | 9 +++++++-- tests/solver/Test_hw_multigrid.cc | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/solver/Test_dwf_hdcr_2level.cc b/tests/solver/Test_dwf_hdcr_2level.cc index df24c9d2..4fa1e302 100644 --- 
a/tests/solver/Test_dwf_hdcr_2level.cc +++ b/tests/solver/Test_dwf_hdcr_2level.cc @@ -262,6 +262,8 @@ int main (int argc, char ** argv) GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); @@ -328,7 +330,7 @@ int main (int argc, char ** argv) Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); std::cout< CoarseCG(0.01,1000); - ConjugateGradient CoarseCG(0.02,1000);// 14.7s + ConjugateGradient CoarseCG(0.01,2000);// 14.7s + eval.resize(0); + evec.resize(0,Coarse5d); DeflatedGuesser DeflCoarseGuesser(evec,eval); NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); diff --git a/tests/solver/Test_dwf_multigrid.cc b/tests/solver/Test_dwf_multigrid.cc index 9e11c160..351e10fd 100644 --- a/tests/solver/Test_dwf_multigrid.cc +++ b/tests/solver/Test_dwf_multigrid.cc @@ -370,6 +370,11 @@ int main (int argc, char ** argv) GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *CoarseCoarse5d = SpaceTimeGrid::makeFiveDimGrid(1,CoarseCoarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + GridRedBlackCartesian *CoarseCoarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseCoarse4d); + GridRedBlackCartesian *CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + std::vector 
seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); std::vector cseeds({5,6,7,8}); @@ -434,8 +439,8 @@ int main (int argc, char ** argv) std::cout< seeds({1,2,3,4}); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); @@ -335,7 +337,7 @@ int main (int argc, char ** argv) NonHermitianLinearOperator LinOpDwf(Ddwf); - Level1Op LDOp (*Coarse5d,0); + Level1Op LDOp (*Coarse5d,*Coarse5dRB,0); std::cout< Date: Thu, 14 Jan 2021 20:49:13 -0500 Subject: [PATCH 107/201] Red black coarse space --- tests/solver/Test_dwf_hdcr_16_rb.cc | 397 +++++ tests/solver/Test_dwf_hdcr_24_regression.cc | 477 ++++++ tests/solver/Test_dwf_hdcr_48_rb.cc | 397 +++++ tests/solver/Test_dwf_hdcr_48_regression.cc | 473 ++++++ tests/solver/Test_hw_multigrid_mixed_48.cc | 1287 ++++++++++++++++ tests/solver/Test_hw_multigrid_mixed_48_rb.cc | 1326 +++++++++++++++++ 6 files changed, 4357 insertions(+) create mode 100644 tests/solver/Test_dwf_hdcr_16_rb.cc create mode 100644 tests/solver/Test_dwf_hdcr_24_regression.cc create mode 100644 tests/solver/Test_dwf_hdcr_48_rb.cc create mode 100644 tests/solver/Test_dwf_hdcr_48_regression.cc create mode 100644 tests/solver/Test_hw_multigrid_mixed_48.cc create mode 100644 tests/solver/Test_hw_multigrid_mixed_48_rb.cc diff --git a/tests/solver/Test_dwf_hdcr_16_rb.cc b/tests/solver/Test_dwf_hdcr_16_rb.cc new file mode 100644 index 00000000..b7900b04 --- /dev/null +++ b/tests/solver/Test_dwf_hdcr_16_rb.cc @@ -0,0 +1,397 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_hdcr.cc + + Copyright (C) 2015 + +Author: Antonin Portelli +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class SolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD 
_hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 32; + const int nbasisc= 32; + auto clatt = GridDefaultLatt(); + for(int 
d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string file("./ckpoint_lat.4000"); + //std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,150,0.0);// + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + typedef Level2Op::CoarseVector CoarseCoarseVector; + CoarseVector c_src(Coarse5d); c_src=1.0; + + std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + 
ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.005,1000); + // SchurDiagMooeeOperator CoarseMpcDagMpc(LDOp); + SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); + SolverWrapper CoarseSolver(LDOp,CoarseRBCG); + + // NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseSolver); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + // pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + 
ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + // std::string file("./ckpoint_lat.4000"); + // std::string file("./ckpoint_lat.1000"); + // NerscIO::readConfiguration(Umu,header,file); + SU::HotConfiguration(RNG4,Umu); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // 
Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,400,50,50,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + /* + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,15.0,0.02,1000,800,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + */ + typedef Level2Op::CoarseVector CoarseCoarseVector; + /* + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); + + + std::cout< IRLHermOpL2(L2Op); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + */ + /* + Chebyshev IRLChebyL2(0.001,15.0,301); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + */ + + /* + std::cout< IRLHermOp(LDOp); + // Chebyshev IRLCheby(0.001,15.0,301); + Chebyshev IRLCheby(0.03,12.0,101); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + 
PlainHermOp IRLOp (IRLHermOp); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + */ + CoarseVector c_src(Coarse5d); c_src=1.0; + // DeflatedGuesser DeflCoarseGuesser(evec,eval); + // NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + /* + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); + + // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); + */ + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.01,1000); + NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + 
TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseCGNE); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + // pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class SolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + 
typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + //std::vector block ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + //std::string 
file("./ckpoint_lat.4000"); + std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + + ////////////////////////////////////////////////// + // Deflate the course space. Recursive multigrid? 
+ ////////////////////////////////////////////////// + typedef Aggregation,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + typedef Level2Op::CoarseVector CoarseCoarseVector; + CoarseVector c_src(Coarse5d); c_src=1.0; + + std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + std::cout< tols({0.015}); + std::vector ords({12}); + std::vector los({0.8}); + for(int l=0;l FineSmoother(los[l],60.0,ords[o],HermIndefOp,Ddwf); + ZeroGuesser CoarseZeroGuesser; + ConjugateGradient CoarseCG(tols[t],10000); + SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); + SolverWrapper CoarseSolver(LDOp,CoarseRBCG); + + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseSolver); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + }}} + + ConjugateGradient pCG(1.0e-8,60000); + std::cout< HermOpEO(Ddwf); + pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + 
SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + // std::string file("./ckpoint_lat.4000"); + std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace 
Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + /* + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,15.0,0.02,1000,800,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + */ + typedef Level2Op::CoarseVector CoarseCoarseVector; + /* + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); + + + std::cout< IRLHermOpL2(L2Op); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + */ + /* + Chebyshev IRLChebyL2(0.001,15.0,301); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + */ + + /* + std::cout< 
IRLHermOp(LDOp); + // Chebyshev IRLCheby(0.001,15.0,301); + Chebyshev IRLCheby(0.03,12.0,101); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + */ + CoarseVector c_src(Coarse5d); c_src=1.0; + // DeflatedGuesser DeflCoarseGuesser(evec,eval); + // NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + /* + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); + + // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); + */ + 
std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.01,1000); + NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseCGNE); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +// TODO +// +// Coarse Grid axpby_ssp_pminus // Inherit from spProj5pm +// Coarse Grid axpby_ssp_pplus + +template +class CayleyBase : public SparseMatrixBase +{ +public: + int Ls; + // protected: + RealD mass; + RealD M5; + // Save arguments to SetCoefficientsInternal + Vector _gamma; + RealD _zolo_hi; + RealD _b; + RealD _c; + + // Cayley form Moebius (tanh and zolotarev) + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; + // For preconditioning Cayley form + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; + // LDU factorisation of the eeoo matrix + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; +public: + CayleyBase(RealD _M5, RealD _mass, int _Ls, RealD b_, RealD c_) : + M5(_M5), + mass(_mass), + Ls(_Ls), + _b(b_), + _c(c_) + { + RealD eps = 1.0; + Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham + this->SetCoefficientsTanh(zdata,1.0,0.0); + Approx::zolotarev_free(zdata); + } + ///////////////////////////////////////////////////////// + // Replicates functionality + // Use a common base class approach + ///////////////////////////////////////////////////////// + // Tanh + void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); + } + //Zolo + void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); + } + //Zolo + void SetCoefficientsInternal(RealD zolo_hi,Vector & 
gamma,RealD b,RealD c) + { + int Ls=this->Ls; + + /////////////////////////////////////////////////////////// + // The Cayley coeffs (unprec) + /////////////////////////////////////////////////////////// + assert(gamma.size()==Ls); + + omega.resize(Ls); + bs.resize(Ls); + cs.resize(Ls); + as.resize(Ls); + + double bpc = b+c; + double bmc = b-c; + _b = b; + _c = c; + _gamma = gamma; // Save the parameters so we can change mass later. + _zolo_hi= zolo_hi; + for(int i=0; i < Ls; i++){ + as[i] = 1.0; + omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code + assert(omega[i]!=Coeff_t(0.0)); + bs[i] = 0.5*(bpc/omega[i] + bmc); + cs[i] = 0.5*(bpc/omega[i] - bmc); + } + + //////////////////////////////////////////////////////// + // Constants for the preconditioned matrix Cayley form + //////////////////////////////////////////////////////// + bee.resize(Ls); + cee.resize(Ls); + beo.resize(Ls); + ceo.resize(Ls); + + for(int i=0;iM5) +1.0); + assert(bee[i]!=Coeff_t(0.0)); + cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); + beo[i]=as[i]*bs[i]; + ceo[i]=-as[i]*cs[i]; + } + aee.resize(Ls); + aeo.resize(Ls); + for(int i=0;i &out){assert(0);}; + virtual void DW (const Field &psi, Field &chi)=0; + virtual void DWDag (const Field &psi, Field &chi)=0; + + void M (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + Meooe5D(psi,Din); + DW(Din,chi); + axpby(chi,1.0,1.0,chi,psi); + M5D(psi,chi); + } + void Mdag (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + DWDag(psi,Din); + MeooeDag5D(Din,chi); + M5Ddag(psi,chi); + axpby (chi,1.0,1.0,chi,psi); + } + ///////////////////////////////// + // P and Pdag - might be needed + ///////////////////////////////// + void P(const Field &psi, Field &chi) + { + int Ls= this->Ls; + chi=Zero(); + for(int s=0;sLs; + chi=Zero(); + for(int s=0;sLs; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; + M5D(psi,chi,chi,lower,diag,upper); + } + void M5Ddag (const 
Field &psi, Field &chi) + { + int Ls=this->Ls; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5Ddag(psi,chi,chi,lower,diag,upper); + } + void Meooe5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5D(psi,psi,Din,lower,diag,upper); + } + void MeooeDag5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag =bs; + Vector upper=cs; + Vector lower=cs; + + for (int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls =this->Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + uint64_t nloop = grid->oSites()/Ls; + + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss= sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1, tmp2; + for(int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls=this->Ls; + + uint64_t nloop = grid->oSites()/Ls; + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + for(int s=0;s +class 
CoarseCayleyFermion : public CayleyBase< Lattice > , ComplexD > +{ +public: + typedef iVector siteVector; + typedef Lattice CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + // Similar to the CoarseOperator but add 5D support. + Geometry geom; + GridBase *Coarse5D; + GridBase *Coarse4D; + CartesianStencil Stencil; + CoarsenedMatrix &Dw; + + GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know + + CoarseCayleyFermion(GridCartesian &CoarseGrid4, + GridCartesian &CoarseGrid5, + CoarsenedMatrix &_Dw, + RealD M5, RealD mass, int Ls, RealD b, RealD c) : + CayleyBase(M5,mass,Ls,b,c), + Coarse4D(&CoarseGrid4), + Coarse5D(&CoarseGrid5), + Dw(_Dw), + geom(CoarseGrid5._ndimension), + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + { + }; + +public: + void Project( CoarseVector &C ) + { + const int Nsimd = CComplex::Nsimd(); + autoView(Cv,C, AcceleratorWrite); + int Ls = this->Ls; + for(int s=0;soSites(), Nsimd, { + int sF= sU*Ls+s; + auto tmp = coalescedRead(Cv[sF]); + coalescedWrite(Cv[sF],tmp); + }); + } + } + //////////////////////////////////////////////// + // This is specific to Coarse Grid Cayley + //////////////////////////////////////////////// + virtual void Mdiag (const CoarseVector &in, CoarseVector &out) + { + std::vector allout(9,in.Grid()); + this->MdirAll(in,allout); + out = allout[8]; + } + virtual void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp) + { + assert(0); + } + virtual void MdirAll (const CoarseVector &in, std::vector &out) + { + conformable(Coarse5D,in.Grid()); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + siteVector *CBp=Stencil.CommBuf(); + + 
int ptype; + int nb2=nbasis/2; + + autoView(in_v , in, AcceleratorRead); + autoView(st, Stencil, AcceleratorRead); + for(int point=0;pointoSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + calcVector nbr; + int ptype; + + StencilEntry *SE=st.GetEntry(ptype,point,sF); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + + { + calcVector nbr; + int ptype; + + for(int point=0;point_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb Aggregates; + + void PromoteFromSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(C4,C,s,0); + _Aggregates.PromoteFromSubspace(C4,F4); + InsertSlice(F4,F,s,0); + } + } + void ProjectToSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(F4,F,s,0); + _Aggregates.ProjectToSubspace (C4,F4); + InsertSlice(C4,C,s,0); + } + Project(C); + } + template + void Test(Aggregates &_Aggregates,GridBase 
*FineGrid, Ddwf &_Ddwf) + { + typedef Lattice FineField; + CoarseVector Cin(Coarse5D); + CoarseVector Cout(Coarse5D); + CoarseVector CFout(Coarse5D); + + FineField Fin(FineGrid); + FineField Fout(FineGrid); + + + std::vector seeds({1,2,3,4,5}); + GridParallelRNG RNG(Coarse5D); RNG.SeedFixedIntegers(seeds); + + gaussian(RNG,Cin); + PromoteFromSubspace(_Aggregates,Cin,Fin); + ProjectToSubspace(_Aggregates,Cin,Fin); + + std::cout << GridLogMessage<< "************ "<M(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "<Mdag(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "< Directions(void) { return geom.directions;}; + virtual std::vector Displacements(void){ return geom.displacements;}; +}; + + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + virtual std::vector Directions(void) { return _Mat.Directions();}; + virtual std::vector Displacements(void){ return _Mat.Displacements();}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + 
_Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template +class MGPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + typedef CoarseCayleyFermion CoarseOperator; + // typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + MGPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + 
_CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + //typedef CoarseCayleyFermion CoarseOperator; + typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector g5Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< block ({2,2,2,2}); // 4,2,2,2 gets worse + std::vector blockc ({1,1,1,1}); + const int nbasis= 24; + const int nbasisc= 32; // decrease, not improvement + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG 
RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); +#if 0 + SU3::TepidConfiguration(RNG4,Umu); + RealD M5=1.0; +#else + std::string file("./ckpoint_lat.1000"); + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,file); + RealD M5=1.8; +#endif + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< MdagM_Dw(Dw_null); + + std::cout< WilsonCG(1.0e-10,40000); + LatticeFermion w_src(UGrid); w_src=1.0; + LatticeFermion w_res(UGrid); + WilsonCG(MdagM_Dw,w_src,w_res); + exit(0); + */ + std::cout< Level1Op4; + typedef CoarseCayleyFermion Level1Op5; + Level1Op4 c_Dw (*Coarse4d,0); + NonHermitianLinearOperator LinOpDw(Dw); + c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) + // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); + + std::cout< MdagM_cDwf(c_Dwf); + + std::cout<,nbasisc> Level2Op; + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< L1Hdwf(c_Dwf); + GridRedBlackCartesian * CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + Level2Op cc_Dwf (*CoarseCoarse5d,*CoarseCoarse5dRB,1); // say it is hermitian + cc_Dwf.CoarsenOperator(Coarse5d,L1Hdwf,CoarseAggregates); + // cc_Dwf.Test(CoarseAggregates,Coarse5d,L1Hdwf); + + typedef Level2Op::CoarseVector CoarseCoarseVector; + + std::cout< CoarseCG(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + + NonHermitianLinearOperator CoarseM(c_Dwf); + MdagMLinearOperator CoarseMdagM(c_Dwf); + + NonHermitianLinearOperator CoarseCoarseM(cc_Dwf); + MdagMLinearOperator CoarseCoarseMdagM(cc_Dwf); + + + std::cout< PM; PM(MdagM_Dw,w_src); + std::cout< cPM; cPM(CoarseMdagM,c_src); + + cc_src=1.0; + 
PowerMethod ccPM; ccPM(CoarseCoarseMdagM,cc_src); + + std::cout< IRLHermOpL2(cc_Dwf); + Chebyshev IRLChebyL2(IRL_lo,IRL_hi,IRL_ord); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + cNm=0; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + cc_src=1.0; + // IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.02,10000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,DeflCoarseCoarseGuesser); + + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser CoarseCoarseZeroGuesser; + + std::cout< CoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,CoarseCoarseZeroGuesser); + { +typedef HDCRPreconditioner,nbasisc,NormalEquations > CoarseMG; + typedef MGPreconditioner > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother1(0.5,22.0,12,CoarseM,c_Dwf); // 37s, 26 iter + ChebyshevSmoother CoarseSmoother2(0.5,22.0,12,CoarseM,c_Dwf); + + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,7,CoarseM,c_Dwf); // 38s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.4,22.0,7,CoarseM,c_Dwf); // 41s, 27 iter + // ChebyshevSmoother CoarseSmoother2(0.4,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.6,22.0,6,CoarseM,c_Dwf); // 26 iter + // ChebyshevSmoother CoarseSmoother2(0.6,22.0,6,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,5,CoarseM,c_Dwf); // 33 iter, 55s + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,5,CoarseM,c_Dwf); + + + CoarseMG Level2Precon (CoarseAggregates, + CoarseM, + CoarseSmoother1, + CoarseSmoother2, + cc_Dwf, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.5, 100, CoarseM,Level2Precon,16,16); // 26 iter, 37s 
+ // PGCR Applying this solver to solve the coarse space problem + // COULD BE FIXED??? + PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0, 100, CoarseM,Level2Precon,16,16); // 35 iter, 45s + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.6, 100, CoarseM,Level2Precon,16,16); // 26,38 (diifferene is measurement noise) + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.2, 100, CoarseM,Level2Precon,16,16); // 26 iter, 47s + L2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + + ChebyshevSmoother FineSmoother1(0.5,60.0,16,FineM,Ddwf); + ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); // + + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother 
FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. + // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + + ThreeLevelMG ThreeLevelPrecon(Aggregates4D, + FineM, + FineSmoother1, + FineSmoother2, + c_Dwf, + L2PGCR); + ThreeLevelPrecon.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,FineM,ThreeLevelPrecon,16,16); + L1PGCR.Level(1); + + f_res=Zero(); + L1PGCR(f_src,f_res); + } + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +// TODO +// +// Coarse Grid axpby_ssp_pminus // Inherit from spProj5pm +// Coarse Grid axpby_ssp_pplus + +template +class CayleyBase : public SparseMatrixBase +{ +public: + int Ls; + // protected: + RealD mass; + RealD M5; + // Save arguments to SetCoefficientsInternal + Vector _gamma; + RealD _zolo_hi; + RealD _b; + RealD _c; + + // Cayley form Moebius (tanh and zolotarev) + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; + // For preconditioning Cayley form + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; + // LDU factorisation of the eeoo matrix + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; +public: + CayleyBase(RealD _M5, RealD _mass, int _Ls, RealD b_, RealD c_) : + M5(_M5), + mass(_mass), + Ls(_Ls), + _b(b_), + _c(c_) + { + RealD eps = 1.0; + Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham + this->SetCoefficientsTanh(zdata,1.0,0.0); + Approx::zolotarev_free(zdata); + } + ///////////////////////////////////////////////////////// + // Replicates functionality + // Use a common base class approach + ///////////////////////////////////////////////////////// + // Tanh + void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); + } + //Zolo + void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); + } + //Zolo + void SetCoefficientsInternal(RealD zolo_hi,Vector & 
gamma,RealD b,RealD c) + { + int Ls=this->Ls; + + /////////////////////////////////////////////////////////// + // The Cayley coeffs (unprec) + /////////////////////////////////////////////////////////// + assert(gamma.size()==Ls); + + omega.resize(Ls); + bs.resize(Ls); + cs.resize(Ls); + as.resize(Ls); + + double bpc = b+c; + double bmc = b-c; + _b = b; + _c = c; + _gamma = gamma; // Save the parameters so we can change mass later. + _zolo_hi= zolo_hi; + for(int i=0; i < Ls; i++){ + as[i] = 1.0; + omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code + assert(omega[i]!=Coeff_t(0.0)); + bs[i] = 0.5*(bpc/omega[i] + bmc); + cs[i] = 0.5*(bpc/omega[i] - bmc); + } + + //////////////////////////////////////////////////////// + // Constants for the preconditioned matrix Cayley form + //////////////////////////////////////////////////////// + bee.resize(Ls); + cee.resize(Ls); + beo.resize(Ls); + ceo.resize(Ls); + + for(int i=0;iM5) +1.0); + assert(bee[i]!=Coeff_t(0.0)); + cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); + beo[i]=as[i]*bs[i]; + ceo[i]=-as[i]*cs[i]; + } + aee.resize(Ls); + aeo.resize(Ls); + for(int i=0;i &out){assert(0);}; + virtual void DW (const Field &psi, Field &chi)=0; + virtual void DWDag (const Field &psi, Field &chi)=0; + + void M (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + Meooe5D(psi,Din); + DW(Din,chi); + axpby(chi,1.0,1.0,chi,psi); + M5D(psi,chi); + } + void Mdag (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + DWDag(psi,Din); + MeooeDag5D(Din,chi); + M5Ddag(psi,chi); + axpby (chi,1.0,1.0,chi,psi); + } + ///////////////////////////////// + // P and Pdag - might be needed + ///////////////////////////////// + void P(const Field &psi, Field &chi) + { + int Ls= this->Ls; + chi=Zero(); + for(int s=0;sLs; + chi=Zero(); + for(int s=0;sLs; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; + M5D(psi,chi,chi,lower,diag,upper); + } + void M5Ddag (const 
Field &psi, Field &chi) + { + int Ls=this->Ls; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5Ddag(psi,chi,chi,lower,diag,upper); + } + void Meooe5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5D(psi,psi,Din,lower,diag,upper); + } + void MeooeDag5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag =bs; + Vector upper=cs; + Vector lower=cs; + + for (int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls =this->Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + uint64_t nloop = grid->oSites()/Ls; + + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss= sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1, tmp2; + for(int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls=this->Ls; + + uint64_t nloop = grid->oSites()/Ls; + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + for(int s=0;s +class 
CoarseCayleyFermion : public CayleyBase< Lattice > , ComplexD > +{ +public: + typedef iVector siteVector; + typedef Lattice CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + // Similar to the CoarseOperator but add 5D support. + Geometry geom; + GridBase *Coarse5D; + GridBase *Coarse4D; + CartesianStencil Stencil; + CoarsenedMatrix &Dw; + + GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know + + CoarseCayleyFermion(GridCartesian &CoarseGrid4, + GridCartesian &CoarseGrid5, + CoarsenedMatrix &_Dw, + RealD M5, RealD mass, int Ls, RealD b, RealD c) : + CayleyBase(M5,mass,Ls,b,c), + Coarse4D(&CoarseGrid4), + Coarse5D(&CoarseGrid5), + Dw(_Dw), + geom(CoarseGrid5._ndimension), + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + { + }; + +public: + void Project( CoarseVector &C ) + { + const int Nsimd = CComplex::Nsimd(); + autoView(Cv,C, AcceleratorWrite); + int Ls = this->Ls; + for(int s=0;soSites(), Nsimd, { + int sF= sU*Ls+s; + auto tmp = coalescedRead(Cv[sF]); + coalescedWrite(Cv[sF],tmp); + }); + } + } + //////////////////////////////////////////////// + // This is specific to Coarse Grid Cayley + //////////////////////////////////////////////// + virtual void Mdiag (const CoarseVector &in, CoarseVector &out) + { + std::vector allout(9,in.Grid()); + this->MdirAll(in,allout); + out = allout[8]; + } + virtual void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp) + { + assert(0); + } + virtual void MdirAll (const CoarseVector &in, std::vector &out) + { + conformable(Coarse5D,in.Grid()); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + siteVector *CBp=Stencil.CommBuf(); + + 
int ptype; + int nb2=nbasis/2; + + autoView(in_v , in, AcceleratorRead); + autoView(st, Stencil, AcceleratorRead); + for(int point=0;pointoSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + calcVector nbr; + int ptype; + + StencilEntry *SE=st.GetEntry(ptype,point,sF); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + + { + calcVector nbr; + int ptype; + + for(int point=0;point_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb Aggregates; + + void PromoteFromSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(C4,C,s,0); + _Aggregates.PromoteFromSubspace(C4,F4); + InsertSlice(F4,F,s,0); + } + } + void ProjectToSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(F4,F,s,0); + _Aggregates.ProjectToSubspace (C4,F4); + InsertSlice(C4,C,s,0); + } + Project(C); + } + template + void Test(Aggregates &_Aggregates,GridBase 
*FineGrid, Ddwf &_Ddwf) + { + typedef Lattice FineField; + CoarseVector Cin(Coarse5D); + CoarseVector Cout(Coarse5D); + CoarseVector CFout(Coarse5D); + + FineField Fin(FineGrid); + FineField Fout(FineGrid); + + + std::vector seeds({1,2,3,4,5}); + GridParallelRNG RNG(Coarse5D); RNG.SeedFixedIntegers(seeds); + + gaussian(RNG,Cin); + PromoteFromSubspace(_Aggregates,Cin,Fin); + ProjectToSubspace(_Aggregates,Cin,Fin); + + std::cout << GridLogMessage<< "************ "<M(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "<Mdag(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "< Directions(void) { return geom.directions;}; + virtual std::vector Displacements(void){ return geom.displacements;}; +}; + +template class SchurSolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SchurSolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public 
LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + virtual std::vector Directions(void) { return _Mat.Directions();}; + virtual std::vector Displacements(void){ return _Mat.Displacements();}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template +class MGPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + typedef CoarseCayleyFermion CoarseOperator; + // typedef SparseMatrixBase 
CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + MGPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + //typedef CoarseCayleyFermion CoarseOperator; + typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, 
FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector g5Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< block ({2,2,2,2}); // 4,2,2,2 gets worse + std::vector blockc ({1,1,1,1}); + const int nbasis= 24; + const int nbasisc= 40; // decrease, not improvement + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); +#if 0 + SU3::TepidConfiguration(RNG4,Umu); + RealD M5=1.0; +#else + std::string file("./ckpoint_lat.1000"); + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,file); + RealD M5=1.8; +#endif + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< MdagM_Dw(Dw_null); + + std::cout< WilsonCG(1.0e-10,40000); + LatticeFermion w_src(UGrid); w_src=1.0; + LatticeFermion w_res(UGrid); + WilsonCG(MdagM_Dw,w_src,w_res); + exit(0); + */ + std::cout< Level1Op4; + typedef CoarseCayleyFermion Level1Op5; + Level1Op4 c_Dw (*Coarse4d,0); + NonHermitianLinearOperator LinOpDw(Dw); + c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) + // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); + + std::cout< MdagM_cDwf(c_Dwf); + + std::cout<,nbasisc> Level2Op; + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< L1Hdwf(c_Dwf); + Level2Op cc_Dwf (*CoarseCoarse5d,*CoarseCoarse5dRB,1); // say it is hermitian + cc_Dwf.CoarsenOperator(Coarse5d,L1Hdwf,CoarseAggregates); + // cc_Dwf.Test(CoarseAggregates,Coarse5d,L1Hdwf); + + typedef Level2Op::CoarseVector CoarseCoarseVector; + + std::cout< 
CoarseCG(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + + NonHermitianLinearOperator CoarseM(c_Dwf); + MdagMLinearOperator CoarseMdagM(c_Dwf); + + NonHermitianLinearOperator CoarseCoarseM(cc_Dwf); + MdagMLinearOperator CoarseCoarseMdagM(cc_Dwf); + + + std::cout< PM; PM(MdagM_Dw,w_src); + std::cout< cPM; cPM(CoarseMdagM,c_src); + + cc_src=1.0; + PowerMethod ccPM; ccPM(CoarseCoarseMdagM,cc_src); + + std::cout< IRLHermOpL2(cc_Dwf); + Chebyshev IRLChebyL2(IRL_lo,IRL_hi,IRL_ord); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + cNm=0; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + cc_src=1.0; + // IRLL2.calc(eval2,evec2,cc_src,cNconv); + + std::vector tols ({0.005,0.001}); + std::vector c_los ({0.1,0.05}); + std::vector c_his ({22.0}); + std::vector f_los ({0.5,0.2}); + std::vector f_his ({60.0}); + std::vector ws ({2,3}); + std::vector c_ords ({32,24}); + std::vector f_ords ({20,16}); + + for(auto w : ws ) { + for(auto tol : tols ) { + for(auto f_ord : f_ords ) { + for(auto c_ord : c_ords ) { + for(auto c_lo : c_los ) { + for(auto c_hi : c_his ) { + for(auto f_lo : f_los ) { + for(auto f_hi : f_his ) { + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser CoarseCoarseZeroGuesser; + ConjugateGradient CoarseCoarseCG(tol,10000); + ZeroGuesser CoarseCoarseGuesser; + SchurRedBlackDiagMooeeSolve CoarseCoarseRBCG(CoarseCoarseCG); + SchurSolverWrapper CoarseCoarseSolver(cc_Dwf,CoarseCoarseRBCG); + + std::cout< CoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,CoarseCoarseZeroGuesser); + { +typedef HDCRPreconditioner,nbasisc,LinearFunction > CoarseMG; + typedef MGPreconditioner > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + // ChebyshevSmoother 
CoarseSmoother1(0.5,22.0,c_ord,CoarseM,c_Dwf); // 37s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,c_ord,CoarseM,c_Dwf); + ChebyshevSmoother CoarseSmoother(c_lo,c_hi,c_ord,CoarseM,c_Dwf); // 37s, 26 iter + + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,7,CoarseM,c_Dwf); // 38s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.4,22.0,7,CoarseM,c_Dwf); // 41s, 27 iter + // ChebyshevSmoother CoarseSmoother2(0.4,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.6,22.0,6,CoarseM,c_Dwf); // 26 iter + // ChebyshevSmoother CoarseSmoother2(0.6,22.0,6,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,5,CoarseM,c_Dwf); // 33 iter, 55s + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,5,CoarseM,c_Dwf); + + + CoarseMG Level2Precon (CoarseAggregates, + CoarseM, + CoarseSmoother, + CoarseSmoother, + cc_Dwf, + CoarseCoarseSolver); + Level2Precon.Level(2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.5, 100, CoarseM,Level2Precon,16,16); // 26 iter, 37s + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); // 296 s, 50 iter + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); // 250 s, 37 iter + PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0, 100, CoarseM,Level2Precon,16,16); // 35 iter, 45s + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.6, 100, CoarseM,Level2Precon,16,16); // 26,38 (diifferene is measurement noise) + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.2, 100, CoarseM,Level2Precon,16,16); // 26 iter, 47s + L2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + + // 
ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + + ChebyshevSmoother FineSmoother(f_lo,f_hi,f_ord,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. 
+ // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + + ThreeLevelMG ThreeLevelPrecon(Aggregates4D, + FineM, + FineSmoother, + FineSmoother, + c_Dwf, + L2PGCR); + ThreeLevelPrecon.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,FineM,ThreeLevelPrecon,16,16); + L1PGCR.Level(1); + + f_res=Zero(); + L1PGCR(f_src,f_res); + } + }}}} + }}} + } + std::cout< Date: Thu, 14 Jan 2021 21:00:36 -0500 Subject: [PATCH 108/201] Gparity fix, and plaquette IO --- Grid/parallelIO/IldgIO.h | 22 +++--- Grid/parallelIO/MetaData.h | 34 +++----- Grid/parallelIO/NerscIO.h | 46 +++++------ Grid/parallelIO/OpenQcdIO.h | 2 +- Grid/parallelIO/OpenQcdIOChromaReference.h | 2 +- Grid/qcd/action/gauge/Gauge.cc | 38 +++++++++ Grid/qcd/action/gauge/GaugeImplementations.h | 79 +++++++++++-------- Grid/qcd/hmc/checkpointers/BaseCheckpointer.h | 3 +- Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h | 5 +- .../qcd/hmc/checkpointers/NerscCheckpointer.h | 7 +- Grid/qcd/modules/Modules.h | 2 +- Grid/qcd/utils/CovariantCshift.h | 51 ++++++++++++ Grid/tensors/Tensor_Ta.h | 14 +++- tests/core/Test_reunitarise.cc | 3 +- tests/hmc/Test_hmc_EODWFRatio_Gparity.cc | 7 +- tests/hmc/Test_hmc_GparityIwasakiGauge.cc | 4 + tests/hmc/Test_hmc_GparityWilsonGauge.cc | 3 + 17 files changed, 220 insertions(+), 102 deletions(-) create mode 100644 Grid/qcd/action/gauge/Gauge.cc diff --git a/Grid/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h index b564371b..ef42c159 100644 --- a/Grid/parallelIO/IldgIO.h +++ b/Grid/parallelIO/IldgIO.h @@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5); //////////////////////////////////////////////////////////// // Helper to fill out 
metadata //////////////////////////////////////////////////////////// - template void ScidacMetaData(Lattice & field, +template void ScidacMetaData(Lattice & field, FieldMetaData &header, scidacRecord & _scidacRecord, scidacFile & _scidacFile) @@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter { // Don't require scidac records EXCEPT checksum // Use Grid MetaData object if present. //////////////////////////////////////////////////////////////// - template - void writeConfiguration(Lattice > &Umu,int sequence,std::string LFN,std::string description) + template + void writeConfiguration(Lattice &Umu,int sequence,std::string LFN,std::string description) { GridBase * grid = Umu.Grid(); - typedef Lattice > GaugeField; - typedef iLorentzColourMatrix vobj; + typedef Lattice GaugeField; + typedef vLorentzColourMatrixD vobj; typedef typename vobj::scalar_object sobj; //////////////////////////////////////// @@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter { ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); + stats Stats; + Stats(Umu,header); + std::string format = header.floating_point; header.ensemble_id = description; header.ensemble_label = description; @@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader { // Else use ILDG MetaData object if present. // Else use SciDAC MetaData object if present. 
//////////////////////////////////////////////////////////////// - template - void readConfiguration(Lattice > &Umu, FieldMetaData &FieldMetaData_) { + template + void readConfiguration(Lattice &Umu, FieldMetaData &FieldMetaData_) { - typedef Lattice > GaugeField; + typedef Lattice GaugeField; typedef typename GaugeField::vector_object vobj; typedef typename vobj::scalar_object sobj; @@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader { if ( found_FieldMetaData || found_usqcdInfo ) { FieldMetaData checker; - GaugeStatistics(Umu,checker); + stats Stats; + Stats(Umu,checker); assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h index 4c1cfbdb..d30ba523 100644 --- a/Grid/parallelIO/MetaData.h +++ b/Grid/parallelIO/MetaData.h @@ -176,29 +176,18 @@ template inline void PrepareMetaData(Lattice & field, FieldMet GridMetaData(grid,header); MachineCharacteristics(header); } -inline void GaugeStatistics(Lattice & data,FieldMetaData &header) +template +class GaugeStatistics { - // How to convert data precision etc... - header.link_trace=WilsonLoops::linkTrace(data); - header.plaquette =WilsonLoops::avgPlaquette(data); -} -inline void GaugeStatistics(Lattice & data,FieldMetaData &header) -{ - // How to convert data precision etc... 
- header.link_trace=WilsonLoops::linkTrace(data); - header.plaquette =WilsonLoops::avgPlaquette(data); -} -template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) -{ - - GridBase *grid = field.Grid(); - std::string format = getFormatString(); - header.floating_point = format; - header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac - GridMetaData(grid,header); - GaugeStatistics(field,header); - MachineCharacteristics(header); -} +public: + void operator()(Lattice & data,FieldMetaData &header) + { + header.link_trace=WilsonLoops::linkTrace(data); + header.plaquette =WilsonLoops::avgPlaquette(data); + } +}; +typedef GaugeStatistics PeriodicGaugeStatistics; +typedef GaugeStatistics ConjugateGaugeStatistics; template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) { GridBase *grid = field.Grid(); @@ -206,7 +195,6 @@ template<> inline void PrepareMetaData(Lattice GaugeField; + static inline void truncate(std::string file){ std::ofstream fout(file,std::ios::out); } @@ -129,12 +131,12 @@ public: // Now the meat: the object readers ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - template - static inline void readConfiguration(Lattice > &Umu, + template + static inline void readConfiguration(GaugeField &Umu, FieldMetaData& header, - std::string file) + std::string file, + GaugeStats GaugeStatisticsCalculator=GaugeStats()) { - typedef Lattice > GaugeField; GridBase *grid = Umu.Grid(); uint64_t offset = readHeader(file,Umu.Grid(),header); @@ -153,23 +155,23 @@ public: // munger is a function of if ( header.data_type == std::string("4D_SU3_GAUGE") ) { if ( ieee32 || ieee32big ) { - BinaryIO::readLatticeObject, LorentzColour2x3F> + BinaryIO::readLatticeObject (Umu,file,Gauge3x2munger(), offset,format, nersc_csum,scidac_csuma,scidac_csumb); } if ( ieee64 || ieee64big ) { - BinaryIO::readLatticeObject, LorentzColour2x3D> + BinaryIO::readLatticeObject 
(Umu,file,Gauge3x2munger(),offset,format, nersc_csum,scidac_csuma,scidac_csumb); } } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { if ( ieee32 || ieee32big ) { - BinaryIO::readLatticeObject,LorentzColourMatrixF> + BinaryIO::readLatticeObject (Umu,file,GaugeSimpleMunger(),offset,format, nersc_csum,scidac_csuma,scidac_csumb); } if ( ieee64 || ieee64big ) { - BinaryIO::readLatticeObject,LorentzColourMatrixD> + BinaryIO::readLatticeObject (Umu,file,GaugeSimpleMunger(),offset,format, nersc_csum,scidac_csuma,scidac_csumb); } @@ -177,7 +179,7 @@ public: assert(0); } - GaugeStatistics(Umu,clone); + GaugeStats Stats; Stats(Umu,clone); std::cout< - static inline void writeConfiguration(Lattice > &Umu, + template + static inline void writeConfiguration(Lattice &Umu, std::string file, int two_row, int bits32) { - typedef Lattice > GaugeField; - - typedef iLorentzColourMatrix vobj; + typedef vLorentzColourMatrixD vobj; typedef typename vobj::scalar_object sobj; FieldMetaData header; @@ -229,7 +229,7 @@ public: GridMetaData(grid,header); assert(header.nd==4); - GaugeStatistics(Umu,header); + GaugeStats Stats; Stats(Umu,header); MachineCharacteristics(header); uint64_t offset; @@ -238,19 +238,19 @@ public: header.floating_point = std::string("IEEE64BIG"); header.data_type = std::string("4D_SU3_GAUGE_3x3"); GaugeSimpleUnmunger munge; - if ( grid->IsBoss() ) { - truncate(file); - offset = writeHeader(header,file); - } - grid->Broadcast(0,(void *)&offset,sizeof(offset)); + if ( grid->IsBoss() ) { + truncate(file); + offset = writeHeader(header,file); + } + grid->Broadcast(0,(void *)&offset,sizeof(offset)); uint32_t nersc_csum,scidac_csuma,scidac_csumb; BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point, nersc_csum,scidac_csuma,scidac_csumb); header.checksum = nersc_csum; - if ( grid->IsBoss() ) { - writeHeader(header,file); - } + if ( grid->IsBoss() ) { + writeHeader(header,file); + } std::cout<Barrier(); timer.Stop(); std::cout << 
Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl; - GaugeStatistics(Umu, clone); + PeriodicGaugeStatistics Stats; Stats(Umu, clone); RealD plaq_diff = fabs(clone.plaquette - header.plaquette); diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h index bab54fe8..886536ad 100644 --- a/Grid/parallelIO/OpenQcdIOChromaReference.h +++ b/Grid/parallelIO/OpenQcdIOChromaReference.h @@ -208,7 +208,7 @@ public: FieldMetaData clone(header); - GaugeStatistics(Umu, clone); + PeriodicGaugeStatistics Stats; Stats(Umu, clone); RealD plaq_diff = fabs(clone.plaquette - header.plaquette); diff --git a/Grid/qcd/action/gauge/Gauge.cc b/Grid/qcd/action/gauge/Gauge.cc new file mode 100644 index 00000000..2b5e2691 --- /dev/null +++ b/Grid/qcd/action/gauge/Gauge.cc @@ -0,0 +1,38 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/gauge/Gauge.cc + +Copyright (C) 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +std::vector ConjugateGaugeImplBase::_conjDirs; + +NAMESPACE_END(Grid); + diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h index a14aec1b..16147c77 100644 --- a/Grid/qcd/action/gauge/GaugeImplementations.h +++ b/Grid/qcd/action/gauge/GaugeImplementations.h @@ -59,14 +59,14 @@ public: } static inline GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { - return Cshift(adj(Link), mu, -1); + return PeriodicBC::CovShiftIdentityBackward(Link, mu); } static inline GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { - return Link; + return PeriodicBC::CovShiftIdentityForward(Link,mu); } static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) { - return Cshift(Link, mu, 1); + return PeriodicBC::ShiftStaple(Link,mu); } static inline bool isPeriodicGaugeField(void) { return true; } @@ -74,7 +74,13 @@ public: // Composition with smeared link, bc's etc.. 
probably need multiple inheritance // Variable precision "S" and variable Nc -template class ConjugateGaugeImpl : public GimplTypes { +class ConjugateGaugeImplBase { +protected: + static std::vector _conjDirs; +}; + + template class ConjugateGaugeImpl : public GimplTypes, ConjugateGaugeImplBase { +private: public: INHERIT_GIMPL_TYPES(GimplTypes); @@ -84,47 +90,56 @@ public: //////////////////////////////////////////////////////////////////////////////////////////////////////////// template static Lattice CovShiftForward(const GaugeLinkField &Link, int mu, - const Lattice &field) { - return ConjugateBC::CovShiftForward(Link, mu, field); + const Lattice &field) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CovShiftForward(Link, mu, field); + else + return PeriodicBC::CovShiftForward(Link, mu, field); } template static Lattice CovShiftBackward(const GaugeLinkField &Link, int mu, - const Lattice &field) { - return ConjugateBC::CovShiftBackward(Link, mu, field); + const Lattice &field) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CovShiftBackward(Link, mu, field); + else + return PeriodicBC::CovShiftBackward(Link, mu, field); } static inline GaugeLinkField - CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { - GridBase *grid = Link.Grid(); - int Lmu = grid->GlobalDimensions()[mu] - 1; - - Lattice> coor(grid); - LatticeCoordinate(coor, mu); - - GaugeLinkField tmp(grid); - tmp = adj(Link); - tmp = where(coor == Lmu, conjugate(tmp), tmp); - return Cshift(tmp, mu, -1); // moves towards positive mu + CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CovShiftIdentityBackward(Link, mu); + else + return PeriodicBC::CovShiftIdentityBackward(Link, mu); } static inline GaugeLinkField - CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { - return Link; + CovShiftIdentityForward(const GaugeLinkField 
&Link, int mu) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CovShiftIdentityForward(Link,mu); + else + return PeriodicBC::CovShiftIdentityForward(Link,mu); } - static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) { - GridBase *grid = Link.Grid(); - int Lmu = grid->GlobalDimensions()[mu] - 1; - - Lattice> coor(grid); - LatticeCoordinate(coor, mu); - - GaugeLinkField tmp(grid); - tmp = Cshift(Link, mu, 1); - tmp = where(coor == Lmu, conjugate(tmp), tmp); - return tmp; + static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::ShiftStaple(Link,mu); + else + return PeriodicBC::ShiftStaple(Link,mu); } + static inline void setDirections(std::vector &conjDirs) { _conjDirs=conjDirs; } + static inline std::vector getDirections(void) { return _conjDirs; } static inline bool isPeriodicGaugeField(void) { return false; } }; diff --git a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h index 3cd05ebc..c09fdeeb 100644 --- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h +++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h @@ -74,7 +74,7 @@ public: conf_file = os.str(); } } - + virtual ~BaseHmcCheckpointer(){}; void check_filename(const std::string &filename){ std::ifstream f(filename.c_str()); if(!f.good()){ @@ -82,7 +82,6 @@ public: abort(); }; } - virtual void initialize(const CheckpointerParameters &Params) = 0; virtual void CheckpointRestore(int traj, typename Impl::Field &U, diff --git a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h index 269caa6e..1bb8aa1a 100644 --- a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h +++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h @@ -45,6 +45,7 @@ private: public: INHERIT_GIMPL_TYPES(Implementation); + typedef GaugeStatistics GaugeStats; ILDGHmcCheckpointer(const CheckpointerParameters 
&Params_) { initialize(Params_); } @@ -78,7 +79,7 @@ public: BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); IldgWriter _IldgWriter(grid->IsBoss()); _IldgWriter.open(config); - _IldgWriter.writeConfiguration(U, traj, config, config); + _IldgWriter.writeConfiguration(U, traj, config, config); _IldgWriter.close(); std::cout << GridLogMessage << "Written ILDG Configuration on " << config @@ -105,7 +106,7 @@ public: FieldMetaData header; IldgReader _IldgReader; _IldgReader.open(config); - _IldgReader.readConfiguration(U,header); // format from the header + _IldgReader.readConfiguration(U,header); // format from the header _IldgReader.close(); std::cout << GridLogMessage << "Read ILDG Configuration from " << config diff --git a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h index cfcc44d8..4534e4c4 100644 --- a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h +++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h @@ -43,7 +43,8 @@ private: public: INHERIT_GIMPL_TYPES(Gimpl); // only for gauge configurations - + typedef GaugeStatistics GaugeStats; + NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); } void initialize(const CheckpointerParameters &Params_) { @@ -60,7 +61,7 @@ public: int precision32 = 1; int tworow = 0; NerscIO::writeRNGState(sRNG, pRNG, rng); - NerscIO::writeConfiguration(U, config, tworow, precision32); + NerscIO::writeConfiguration(U, config, tworow, precision32); } }; @@ -74,7 +75,7 @@ public: FieldMetaData header; NerscIO::readRNGState(sRNG, pRNG, header, rng); - NerscIO::readConfiguration(U, header, config); + NerscIO::readConfiguration(U, header, config); }; }; diff --git a/Grid/qcd/modules/Modules.h b/Grid/qcd/modules/Modules.h index 1c1c8889..7aa3f0ac 100644 --- a/Grid/qcd/modules/Modules.h +++ b/Grid/qcd/modules/Modules.h @@ -99,7 +99,7 @@ public: virtual Prod* getPtr() = 0; // add a getReference? 
- + virtual ~HMCModuleBase(){}; virtual void print_parameters(){}; // default to nothing }; diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index cee1fa12..6c70706f 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -53,6 +53,24 @@ namespace PeriodicBC { return Cshift(tmp,mu,-1);// moves towards positive mu } + template Lattice + CovShiftIdentityBackward(const Lattice &Link, int mu) + { + return Cshift(adj(Link), mu, -1); + } + + template Lattice + CovShiftIdentityForward(const Lattice &Link, int mu) + { + return Link; + } + + template Lattice + ShiftStaple(const Lattice &Link, int mu) + { + return Cshift(Link, mu, 1); + } + template::value,void>::type * = nullptr> auto CovShiftForward(const Lattice &Link, int mu, @@ -70,6 +88,7 @@ namespace PeriodicBC { return CovShiftBackward(Link,mu,arg); } + } @@ -139,6 +158,38 @@ namespace ConjugateBC { // std::cout<<"Gparity::CovCshiftBackward mu="< Lattice + CovShiftIdentityBackward(const Lattice &Link, int mu) { + GridBase *grid = Link.Grid(); + int Lmu = grid->GlobalDimensions()[mu] - 1; + + Lattice> coor(grid); + LatticeCoordinate(coor, mu); + + Lattice tmp(grid); + tmp = adj(Link); + tmp = where(coor == Lmu, conjugate(tmp), tmp); + return Cshift(tmp, mu, -1); // moves towards positive mu + } + template Lattice + CovShiftIdentityForward(const Lattice &Link, int mu) { + return Link; + } + + template Lattice + ShiftStaple(const Lattice &Link, int mu) + { + GridBase *grid = Link.Grid(); + int Lmu = grid->GlobalDimensions()[mu] - 1; + + Lattice> coor(grid); + LatticeCoordinate(coor, mu); + + Lattice tmp(grid); + tmp = Cshift(Link, mu, 1); + tmp = where(coor == Lmu, conjugate(tmp), tmp); + return tmp; + } template::value,void>::type * = nullptr> auto CovShiftForward(const Lattice &Link, diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index bbaa4a00..90e57b2b 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -117,7 
+117,19 @@ accelerator_inline iMatrix ProjectOnGroup(const iMatrix &arg) ret._internal[b][c] -= pr * ret._internal[c1][c]; } } - + } + + // Normalise last row + { + int c1 = N-1; + zeroit(inner); + for(int c2=0;c2(U,Nc-1,i); element = element * phase; PokeIndex(U,element,Nc-1,i); - } + } + U=U*0.1; UU=U; detU= Determinant(U) ; diff --git a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc index 3434fccc..9ca0b0a0 100644 --- a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc +++ b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc @@ -81,6 +81,10 @@ int main(int argc, char **argv) { // that have a complex construction // standard RealD beta = 5.6 ; + const int nu = 3; + std::vector twists(Nd,0); + twists[nu] = 1; + ConjugateGimplD::setDirections(twists); ConjugateWilsonGaugeActionR Waction(beta); const int Ls = 8; @@ -93,9 +97,6 @@ int main(int argc, char **argv) { // temporarily need a gauge field LatticeGaugeField U(GridPtr); - const int nu = 3; - std::vector twists(Nd,0); - twists[nu] = 1; FermionAction::ImplParams params; params.twists = twists; Real mass=0.04; diff --git a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc index bc47b6c2..7f74d5d8 100644 --- a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc +++ b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc @@ -79,6 +79,10 @@ int main(int argc, char **argv) { // that have a complex construction // standard RealD beta = 2.6 ; + const int nu = 3; + std::vector twists(Nd,0); + twists[nu] = 1; + ConjugateGimplD::setDirections(twists); ConjugateIwasakiGaugeActionR Waction(beta); diff --git a/tests/hmc/Test_hmc_GparityWilsonGauge.cc b/tests/hmc/Test_hmc_GparityWilsonGauge.cc index eb057181..b8c078fe 100644 --- a/tests/hmc/Test_hmc_GparityWilsonGauge.cc +++ b/tests/hmc/Test_hmc_GparityWilsonGauge.cc @@ -80,6 +80,9 @@ int main(int argc, char **argv) { // that have a complex construction // standard RealD beta = 5.6 ; + std::vector twists(Nd,0); + twists[3] = 1; + 
ConjugateGimplD::setDirections(twists); ConjugateWilsonGaugeActionR Waction(beta); From 3c23a947cc4e22b6c01afd9eac5d5a4add9035c7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 15 Jan 2021 09:16:02 -0500 Subject: [PATCH 109/201] Fixed test for very much non-unit det --- tests/core/Test_reunitarise.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc index af164a75..6644be1a 100644 --- a/tests/core/Test_reunitarise.cc +++ b/tests/core/Test_reunitarise.cc @@ -103,7 +103,7 @@ int main (int argc, char ** argv) detU= Determinant(U) ; detU=detU-1.0; - std::cout << "Determinant before screw up " << norm2(detU)< Date: Mon, 18 Jan 2021 18:57:05 +0000 Subject: [PATCH 110/201] bugfix --- Grid/qcd/utils/BaryonUtils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 69bf8959..edc5c8d5 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -951,7 +951,7 @@ void BaryonUtils::BaryonGamma3pt( spinor result=Zero(); BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec_p[0],Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); //coalescedWrite(vcorr[ss],vcorr[ss]+result); //diff by factor 10??? 
- coalescedWrite(vcorr[ss],vcorr[ss]+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); });//end loop over lattice sites } else if (group == 2) { @@ -961,7 +961,7 @@ void BaryonUtils::BaryonGamma3pt( typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup2Site(Dq_spec_p[0],Dq_ti,Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],vcorr[ss]+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); });//end loop over lattice sites } else if (group == 3) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { @@ -970,7 +970,7 @@ void BaryonUtils::BaryonGamma3pt( typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup3Site(Dq_spec_p[0],Dq_spec_p[1],Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],vcorr[ss]+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); });//end loop over lattice sites } From 8bfa0e74f837c914efaf95929149f9ce5b0a5487 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 19 Jan 2021 12:27:57 +0000 Subject: [PATCH 111/201] final version, tested on CPU and GPU --- Grid/qcd/utils/BaryonUtils.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index edc5c8d5..ca8b66ef 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -950,7 +950,6 @@ void BaryonUtils::BaryonGamma3pt( typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec_p[0],Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - //coalescedWrite(vcorr[ss],vcorr[ss]+result); //diff by factor 10??? 
coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); });//end loop over lattice sites From fc6d07897fe4a9f7f1a80bd9a7849fe373648b42 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 19 Jan 2021 12:32:48 +0000 Subject: [PATCH 112/201] revert changes --- tests/solver/Test_zMADWF_prec.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/solver/Test_zMADWF_prec.cc b/tests/solver/Test_zMADWF_prec.cc index f18e1d86..d1168764 100644 --- a/tests/solver/Test_zMADWF_prec.cc +++ b/tests/solver/Test_zMADWF_prec.cc @@ -52,7 +52,7 @@ struct TestParams{ bool zmobius_inner; double lambda_max; //upper bound of H_T eigenvalue range required to generate zMobius approximation - TestParams(): load_config(false), config_file("ckpoint_lat.1000"), mass(0.01), + TestParams(): load_config(true), config_file("ckpoint_lat.1000"), mass(0.01), Ls_outer(24), b_plus_c_outer(2.0), resid_outer(1e-8), Ls_inner(12), b_plus_c_inner(1.0), resid_inner(1e-8), zmobius_inner(true), lambda_max(1.42), outer_precon("Standard"), inner_precon("Standard") {} @@ -246,7 +246,7 @@ void run(const TestParams ¶ms){ typename RunParamsInner::SchurSolverType SchurSolver_inner(CG_inner); ZeroGuesser Guess; - MADWF > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 10000, &update); + MADWF > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 100, &update); LatticeFermionD result_MADWF(FGrid_outer); result_MADWF = Zero(); From df16202865e5ff8277b805336b12a054b316ea08 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 19 Jan 2021 19:25:27 +0000 Subject: [PATCH 113/201] weird bug in 2pt function... 
--- Grid/qcd/utils/BaryonUtils.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index ca8b66ef..7393c232 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -513,6 +513,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, GridBase *grid = q1_left.Grid(); autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( vcorr_read , baryon_corr , AcceleratorRead); autoView( v1 , q1_left , AcceleratorRead); autoView( v2 , q2_left , AcceleratorRead); autoView( v3 , q3_left , AcceleratorRead); @@ -533,7 +534,8 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D1 = v1(ss); auto D2 = v2(ss); auto D3 = v3(ss); - typedef decltype(coalescedRead(vbaryon_corr[0])) cVec; + //typedef decltype(coalescedRead(vbaryon_corr[0])) cVec; + typedef decltype(coalescedRead(vcorr_read[0])) cVec; cVec result=Zero(); BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); coalescedWrite(vbaryon_corr[ss],result); @@ -562,6 +564,7 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, GridBase *grid = q1_left.Grid(); autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( vcorr_read , baryon_corr , AcceleratorRead); autoView( v1 , q1_left , AcceleratorRead); autoView( v2 , q2_left , AcceleratorRead); autoView( v3 , q3_left , AcceleratorRead); @@ -570,7 +573,8 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, auto D1 = v1(ss); auto D2 = v2(ss); auto D3 = v3(ss); - typedef decltype(coalescedRead(vbaryon_corr[0])) spinor; + //typedef decltype(coalescedRead(vbaryon_corr[0])) spinor; + typedef decltype(coalescedRead(vcorr_read[0])) spinor; spinor result=Zero(); BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); coalescedWrite(vbaryon_corr[ss],result); @@ -937,6 +941,7 @@ 
void BaryonUtils::BaryonGamma3pt( GridBase *grid = q_tf.Grid(); autoView( vcorr , stn_corr , AcceleratorWrite); + autoView( vcorr_read , stn_corr , AcceleratorRead); autoView( vq_ti , q_ti , AcceleratorRead); autoView( vq_tf , q_tf , AcceleratorRead); @@ -947,29 +952,29 @@ void BaryonUtils::BaryonGamma3pt( accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_ti = vq_ti(ss); auto Dq_tf = vq_tf(ss); - typedef decltype(coalescedRead(vcorr[0])) spinor; + typedef decltype(coalescedRead(vcorr_read[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec_p[0],Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr_read[ss])+result); });//end loop over lattice sites } else if (group == 2) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_ti = vq_ti(ss); auto Dq_tf = vq_tf(ss); - typedef decltype(coalescedRead(vcorr[0])) spinor; + typedef decltype(coalescedRead(vcorr_read[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup2Site(Dq_spec_p[0],Dq_ti,Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr_read[ss])+result); });//end loop over lattice sites } else if (group == 3) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_ti = vq_ti(ss); auto Dq_tf = vq_tf(ss); - typedef decltype(coalescedRead(vcorr[0])) spinor; + typedef decltype(coalescedRead(vcorr_read[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup3Site(Dq_spec_p[0],Dq_spec_p[1],Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr_read[ss])+result); });//end loop over lattice sites } From ff1fa988085ae868ae603fe9562968b3dcd57da2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 Jan 2021 21:38:23 -0500 
Subject: [PATCH 114/201] Fix for GPU conserveed current --- Grid/qcd/action/fermion/WilsonImpl.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 52e1ee00..d7941d1f 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -106,11 +106,15 @@ public: const _SpinorField & phi, int mu) { + const int Nsimd = SiteHalfSpinor::Nsimd(); autoView( out_v, out, AcceleratorWrite); autoView( phi_v, phi, AcceleratorRead); autoView( Umu_v, Umu, AcceleratorRead); - accelerator_for(sss,out.Grid()->oSites(),1,{ - multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); + typedef decltype(coalescedRead(out_v[0])) calcSpinor; + accelerator_for(sss,out.Grid()->oSites(),Nsimd,{ + calcSpinor tmp; + multLink(tmp,Umu_v[sss],phi_v(sss),mu); + coalescedWrite(out_v[sss],tmp); }); } From 11a5fd09d65427aaaa68d9dc28318bfe92b08097 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 21 Jan 2021 21:39:41 -0500 Subject: [PATCH 115/201] Hot config --- tests/debug/Test_cayley_mres.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 5282c756..ea88885e 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -117,8 +117,8 @@ int main (int argc, char ** argv) else { std::cout<::ColdConfiguration(Umu); - // SU::HotConfiguration(RNG4,Umu); + //SU::ColdConfiguration(Umu); + SU::HotConfiguration(RNG4,Umu); } RealD mass=0.3; From 2983b6fdf6485569c7d621bb22e4dcec23633e22 Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Sat, 23 Jan 2021 12:41:48 +0000 Subject: [PATCH 116/201] Optional (superficial) changes to make comparison with Hadrons WardIdentity module easier: use Schur solver; example of Hadrons random gauge init; logging updates; only solve reverse propagator if provided --- 
tests/debug/Test_cayley_mres.cc | 86 +++++++++++++++------------------ 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index ea88885e..bfbc3cf7 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -33,13 +33,14 @@ using namespace Grid; template -void TestConserved(What & Ddwf, What & Ddwfrev, +void TestConserved(What & Ddwf, LatticeGaugeField &Umu, GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid, GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid, RealD mass, RealD M5, GridParallelRNG *RNG4, - GridParallelRNG *RNG5); + GridParallelRNG *RNG5, + What *Ddwfrev=nullptr); Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, @@ -102,10 +103,11 @@ int main (int argc, char ** argv) GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); - std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG4(UGrid); + std::vector seeds4({1,2,3,4}); RNG4.SeedFixedIntegers(seeds4); + //const std::string seeds4{ "test-gauge-3000" }; RNG4.SeedUniqueString( seeds4 ); LatticeGaugeField Umu(UGrid); if( argc > 1 && argv[1][0] != '-' ) @@ -116,8 +118,8 @@ int main (int argc, char ** argv) } else { - std::cout<::ColdConfiguration(Umu); + std::cout<::ColdConfiguration(Umu); SU::HotConfiguration(RNG4,Umu); } @@ -127,7 +129,7 @@ int main (int argc, char ** argv) std::cout<(Ddwf,Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestConserved(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); RealD b=1.5;// Scale factor b+c=2, b-c=1 RealD c=0.5; @@ -137,13 +139,13 @@ int main (int argc, char ** argv) std::cout<(Dmob,Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestConserved(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); 
std::cout<(Dsham,Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestConserved(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(ZDmob,ZDmobrev,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestConserved(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5,&ZDmobrev); Grid_finalize(); } @@ -161,22 +162,17 @@ int main (int argc, char ** argv) template -void TestConserved(Action & Ddwf, - Action & Ddwfrev, +void TestConserved(Action & Ddwf, LatticeGaugeField &Umu, GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid, GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid, RealD mass, RealD M5, GridParallelRNG *RNG4, - GridParallelRNG *RNG5) + GridParallelRNG *RNG5, + Action * Ddwfrev) { - int Ls=Ddwf.Ls; - - LatticePropagator phys_src(UGrid); - - std::vector U(4,UGrid); - - LatticePropagator seqsrc(FGrid); + LatticePropagator phys_src(UGrid); + LatticePropagator seqsrc(FGrid); LatticePropagator prop5(FGrid); LatticePropagator prop5rev(FGrid); LatticePropagator prop4(UGrid); @@ -194,9 +190,9 @@ void TestConserved(Action & Ddwf, phys_src=Zero(); pokeSite(kronecker,phys_src,coor); - MdagMLinearOperator HermOp(Ddwf); - MdagMLinearOperator HermOprev(Ddwfrev); ConjugateGradient CG(1.0e-16,100000); + SchurRedBlackDiagTwoSolve schur(CG); + ZeroGuesser zpg; for(int s=0;s(prop5,result5,s,c); LatticeFermion result4(UGrid); Ddwf.ExportPhysicalFermionSolution(result5,result4); FermToProp(prop4,result4,s,c); - Ddwfrev.ImportPhysicalFermionSource(src4,src5); - Ddwfrev.Mdag(src5,Mdagsrc5); - CG(HermOprev,Mdagsrc5,result5); + if( Ddwfrev ) { + Ddwfrev->ImportPhysicalFermionSource(src4,src5); + result5 = Zero(); + schur(*Ddwfrev,src5,result5,zpg); + } FermToProp(prop5rev,result5,s,c); } } @@ -251,11 +247,7 @@ void TestConserved(Action & Ddwf, PropToFerm(src5,seqsrc,s,c); LatticeFermion result5(FGrid); result5=Zero(); - - // CGNE - LatticeFermion Mdagsrc5 (FGrid); - Ddwf.Mdag(src5,Mdagsrc5); - CG(HermOp,Mdagsrc5,result5); + 
schur(Ddwf,src5,result5,zpg); LatticeFermion result4(UGrid); Ddwf.ExportPhysicalFermionSolution(result5,result4); @@ -276,10 +268,10 @@ void TestConserved(Action & Ddwf, Ddwf.ContractConservedCurrent(prop5rev,prop5,Vector_mu,phys_src,Current::Vector,Tdir); Ddwf.ContractJ5q(prop5,PJ5q); - PA = trace(g5*Axial_mu); - SV = trace(Vector_mu); - VV = trace(gT*Vector_mu); - PP = trace(adj(prop4)*prop4); + PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current + SV = trace(Vector_mu); // Scalar-Vector conserved current + VV = trace(gT*Vector_mu); // (local) Vector-Vector conserved current + PP = trace(adj(prop4)*prop4); // Pseudoscalar density // Spatial sum sliceSum(PA,sumPA,Tdir); @@ -288,15 +280,17 @@ void TestConserved(Action & Ddwf, sliceSum(PP,sumPP,Tdir); sliceSum(PJ5q,sumPJ5q,Tdir); - int Nt=sumPA.size(); + const int Nt{static_cast(sumPA.size())}; + std::cout< Date: Mon, 25 Jan 2021 15:09:36 +0000 Subject: [PATCH 117/201] Fix issue for GPU by ensuring accelerator_inline version of convertType is available for Grid::complex. 
This removes many warnings in Hadrons Simplify the SFINAE syntax and correct convertType for iScalar --- Grid/lattice/Lattice_transfer.h | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 91de721f..c91fa4d1 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -97,6 +97,21 @@ accelerator_inline void convertType(ComplexF & out, const std::complex & out = in; } +template +accelerator_inline typename std::enable_if::value>::type +convertType(T & out, const T & in) { + out = in; +} + +// This would allow for conversions between GridFundamental types, but is not strictly needed as yet +/*template +accelerator_inline typename std::enable_if::value && isGridFundamental::value>::type +// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed +//accelerator_inline typename std::enable_if::value && !isGridTensor::value>::type +convertType(T1 & out, const T2 & in) { + out = in; +}*/ + #ifdef GRID_SIMT accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in; @@ -117,23 +132,20 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); } -template - accelerator_inline void convertType(iMatrix & out, const iMatrix & in); -template - accelerator_inline void convertType(iVector & out, const iVector & in); - -template::value, T1>::type* = nullptr> -accelerator_inline void convertType(T1 & out, const iScalar & in) { - convertType(out,in._internal); +template +accelerator_inline void convertType(iScalar & out, const iScalar & in) { + convertType(out._internal,in._internal); } -template::value, T1>::type* = nullptr> -accelerator_inline void convertType(T1 & out, const iScalar & in) { +template 
+accelerator_inline typename std::enable_if::value>::type +convertType(T1 & out, const iScalar & in) { convertType(out,in._internal); } template -accelerator_inline void convertType(iScalar & out, const T2 & in) { +accelerator_inline typename std::enable_if::value>::type +convertType(iScalar & out, const T2 & in) { convertType(out._internal,in); } @@ -150,11 +162,6 @@ accelerator_inline void convertType(iVector & out, const iVector & i convertType(out._internal[i],in._internal[i]); } -template::value, T>::type* = nullptr> -accelerator_inline void convertType(T & out, const T & in) { - out = in; -} - template accelerator_inline void convertType(Lattice & out, const Lattice & in) { autoView( out_v , out,AcceleratorWrite); From 81d88d9f4df1c9944c8b33db6dd123c36560757d Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Wed, 27 Jan 2021 21:09:51 +0000 Subject: [PATCH 118/201] fixes --- Grid/qcd/utils/BaryonUtils.h | 63 ++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 7393c232..d6b48ba0 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -52,7 +52,7 @@ public: const Gamma GammaA_right, const Gamma GammaB_right, const int parity, - const bool * wick_contractions, + const int wick_contractions, robj &result); template accelerator_inline static void BaryonSiteMatrix(const mobj &D1, @@ -62,12 +62,12 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool * wick_contractions, + const int wick_contractions, robj &result); public: static void WickContractions(std::string qi, std::string qf, - bool* wick_contractions); + int &wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, const PropagatorField &q2_left, const PropagatorField &q3_left, @@ -75,7 +75,7 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool* 
wick_contractions, + const int wick_contractions, const int parity, ComplexField &baryon_corr); static void ContractBaryonsMatrix(const PropagatorField &q1_left, @@ -85,7 +85,7 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool* wick_contractions, + const int wick_contractions, SpinMatrixField &baryon_corr); template static void ContractBaryonsSliced(const mobj &D1, @@ -95,7 +95,7 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool* wick_contractions, + const int wick_contractions, const int parity, const int nt, robj &result); @@ -107,7 +107,7 @@ public: const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool* wick_contractions, + const int wick_contractions, const int nt, robj &result); private: @@ -234,7 +234,7 @@ void BaryonUtils::BaryonSite(const mobj &D1, const Gamma GammaA_f, const Gamma GammaB_f, const int parity, - const bool * wick_contraction, + const int wick_contraction, robj &result) { @@ -268,7 +268,7 @@ void BaryonUtils::BaryonSite(const mobj &D1, ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ + if (wick_contraction & 1){ for (int rho=0; rho::BaryonSite(const mobj &D1, } } //This is the \delta_{456}^{231} part - if (wick_contraction[1]){ + if (wick_contraction & 2){ for (int rho=0; rho::BaryonSite(const mobj &D1, }} } //This is the \delta_{456}^{312} part - if (wick_contraction[2]){ + if (wick_contraction & 4){ for (int rho=0; rho::BaryonSite(const mobj &D1, }} } //This is the \delta_{456}^{132} part - if (wick_contraction[3]){ + if (wick_contraction & 8){ for (int rho=0; rho::BaryonSite(const mobj &D1, } } //This is the \delta_{456}^{321} part - if (wick_contraction[4]){ + if (wick_contraction & 16){ for (int rho=0; rho::BaryonSite(const mobj &D1, }} } //This is the \delta_{456}^{213} part - if (wick_contraction[5]){ + if 
(wick_contraction & 32){ for (int rho=0; rho::BaryonSiteMatrix(const mobj &D1, const Gamma GammaB_i, const Gamma GammaA_f, const Gamma GammaB_f, - const bool * wick_contraction, + const int wick_contraction, robj &result) { @@ -383,7 +383,7 @@ void BaryonUtils::BaryonSiteMatrix(const mobj &D1, ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ + if (wick_contraction & 1){ for (int rho_i=0; rho_i::BaryonSiteMatrix(const mobj &D1, }} } //This is the \delta_{456}^{231} part - if (wick_contraction[1]){ + if (wick_contraction & 2){ for (int rho_i=0; rho_i::BaryonSiteMatrix(const mobj &D1, }} } //This is the \delta_{456}^{312} part - if (wick_contraction[2]){ + if (wick_contraction & 4){ for (int rho_i=0; rho_i::BaryonSiteMatrix(const mobj &D1, }} } //This is the \delta_{456}^{132} part - if (wick_contraction[3]){ + if (wick_contraction & 8){ for (int rho_i=0; rho_i::BaryonSiteMatrix(const mobj &D1, }} } //This is the \delta_{456}^{321} part - if (wick_contraction[4]){ + if (wick_contraction & 16){ for (int rho_i=0; rho_i::BaryonSiteMatrix(const mobj &D1, }} } //This is the \delta_{456}^{213} part - if (wick_contraction[5]){ + if (wick_contraction & 32){ for (int rho_i=0; rho_i::BaryonSiteMatrix(const mobj &D1, * flavours. 
* * The array wick_contractions must be of length 6 */ template -void BaryonUtils::WickContractions(std::string qi, std::string qf, bool* wick_contractions) { +void BaryonUtils::WickContractions(std::string qi, std::string qf, int &wick_contractions) { + assert(qi.size() == 3 && qf.size() == 3 && "Only sets of 3 quarks accepted."); const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; + wick_contractions=0; for (int ie=0; ie < 6 ; ie++) { - wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 - && qi[0] == qf[epsilon[ie][0]] + wick_contractions += ( ( qi[0] == qf[epsilon[ie][0]] && qi[1] == qf[epsilon[ie][1]] - && qi[2] == qf[epsilon[ie][2]]); + && qi[2] == qf[epsilon[ie][2]]) ? 1 : 0) << ie; } } @@ -500,7 +501,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool* wick_contractions, + const int wick_contractions, const int parity, ComplexField &baryon_corr) { @@ -522,9 +523,9 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); for (int ie=0; ie < 6 ; ie++){ if(ie==0 or ie==3){ - bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; + //bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; } else{ - bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; + //bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; } } Real t=0.; @@ -554,7 +555,7 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool* wick_contractions, + const int wick_contractions, SpinMatrixField &baryon_corr) { @@ -595,7 +596,7 @@ void BaryonUtils::ContractBaryonsSliced(const mobj &D1, const Gamma 
GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool* wick_contractions, + const int wick_contractions, const int parity, const int nt, robj &result) @@ -620,7 +621,7 @@ void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, const Gamma GammaB_left, const Gamma GammaA_right, const Gamma GammaB_right, - const bool* wick_contractions, + const int wick_contractions, const int nt, robj &result) { From 712bb406502922fb0ceb733e7a74f0ce2b902e2c Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 15 Dec 2020 16:33:29 +0000 Subject: [PATCH 119/201] merge develop --- tests/solver/Test_zMADWF_prec.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/solver/Test_zMADWF_prec.cc b/tests/solver/Test_zMADWF_prec.cc index d1168764..f18e1d86 100644 --- a/tests/solver/Test_zMADWF_prec.cc +++ b/tests/solver/Test_zMADWF_prec.cc @@ -52,7 +52,7 @@ struct TestParams{ bool zmobius_inner; double lambda_max; //upper bound of H_T eigenvalue range required to generate zMobius approximation - TestParams(): load_config(true), config_file("ckpoint_lat.1000"), mass(0.01), + TestParams(): load_config(false), config_file("ckpoint_lat.1000"), mass(0.01), Ls_outer(24), b_plus_c_outer(2.0), resid_outer(1e-8), Ls_inner(12), b_plus_c_inner(1.0), resid_inner(1e-8), zmobius_inner(true), lambda_max(1.42), outer_precon("Standard"), inner_precon("Standard") {} @@ -246,7 +246,7 @@ void run(const TestParams ¶ms){ typename RunParamsInner::SchurSolverType SchurSolver_inner(CG_inner); ZeroGuesser Guess; - MADWF > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 100, &update); + MADWF > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 10000, &update); LatticeFermionD result_MADWF(FGrid_outer); result_MADWF = Zero(); From 7905afa9f5b19b147b59eb809b87d1a27e4fc950 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 19 Jan 2021 12:32:48 +0000 Subject: [PATCH 120/201] revert changes --- 
tests/solver/Test_zMADWF_prec.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/solver/Test_zMADWF_prec.cc b/tests/solver/Test_zMADWF_prec.cc index f18e1d86..d1168764 100644 --- a/tests/solver/Test_zMADWF_prec.cc +++ b/tests/solver/Test_zMADWF_prec.cc @@ -52,7 +52,7 @@ struct TestParams{ bool zmobius_inner; double lambda_max; //upper bound of H_T eigenvalue range required to generate zMobius approximation - TestParams(): load_config(false), config_file("ckpoint_lat.1000"), mass(0.01), + TestParams(): load_config(true), config_file("ckpoint_lat.1000"), mass(0.01), Ls_outer(24), b_plus_c_outer(2.0), resid_outer(1e-8), Ls_inner(12), b_plus_c_inner(1.0), resid_inner(1e-8), zmobius_inner(true), lambda_max(1.42), outer_precon("Standard"), inner_precon("Standard") {} @@ -246,7 +246,7 @@ void run(const TestParams ¶ms){ typename RunParamsInner::SchurSolverType SchurSolver_inner(CG_inner); ZeroGuesser Guess; - MADWF > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 10000, &update); + MADWF > madwf(D_outer, D_inner, PV_outer, SchurSolver_inner, Guess, params.resid_outer, 100, &update); LatticeFermionD result_MADWF(FGrid_outer); result_MADWF = Zero(); From 96dd7a8fbd66d438618962ca930e7bd1eef34d08 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 16 Nov 2020 17:15:34 +0100 Subject: [PATCH 121/201] Flop cout matches DiRAC-ITT-2020 --- benchmarks/Benchmark_ITT.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 032535b3..5d602ce9 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,7 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; -#if 1 +#if 0 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; #else double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; From 
a673b6a54da17b319d8f6707893a9a3f4005be32 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Thu, 28 Jan 2021 14:15:09 +0000 Subject: [PATCH 122/201] prettify --- Grid/qcd/utils/BaryonUtils.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index d6b48ba0..56c5781d 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -513,11 +513,11 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, GridBase *grid = q1_left.Grid(); - autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); - autoView( vcorr_read , baryon_corr , AcceleratorRead); - autoView( v1 , q1_left , AcceleratorRead); - autoView( v2 , q2_left , AcceleratorRead); - autoView( v3 , q3_left , AcceleratorRead); + autoView( vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( vcorr_read , baryon_corr , AcceleratorRead); + autoView( v1 , q1_left , AcceleratorRead); + autoView( v2 , q2_left , AcceleratorRead); + autoView( v3 , q3_left , AcceleratorRead); Real bytes =0.; bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); @@ -564,11 +564,11 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, GridBase *grid = q1_left.Grid(); - autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); - autoView( vcorr_read , baryon_corr , AcceleratorRead); - autoView( v1 , q1_left , AcceleratorRead); - autoView( v2 , q2_left , AcceleratorRead); - autoView( v3 , q3_left , AcceleratorRead); + autoView( vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( vcorr_read , baryon_corr , AcceleratorRead); + autoView( v1 , q1_left , AcceleratorRead); + autoView( v2 , q2_left , AcceleratorRead); + autoView( v3 , q3_left , AcceleratorRead); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto D1 = v1(ss); @@ -941,10 +941,10 @@ void BaryonUtils::BaryonGamma3pt( GridBase *grid = q_tf.Grid(); - autoView( vcorr , stn_corr , 
AcceleratorWrite); + autoView( vcorr , stn_corr , AcceleratorWrite); autoView( vcorr_read , stn_corr , AcceleratorRead); - autoView( vq_ti , q_ti , AcceleratorRead); - autoView( vq_tf , q_tf , AcceleratorRead); + autoView( vq_ti , q_ti , AcceleratorRead); + autoView( vq_tf , q_tf , AcceleratorRead); Vector my_Dq_spec{Dq_spec1,Dq_spec2}; mobj * Dq_spec_p = &my_Dq_spec[0]; From bc496dd8440449a2e9d1bc5dce074192640cd094 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Thu, 28 Jan 2021 14:29:56 +0000 Subject: [PATCH 123/201] change back benchmark_ITT --- benchmarks/Benchmark_ITT.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 5d602ce9..032535b3 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -445,7 +445,7 @@ public: // 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8 // 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2 // double flops=(1344.0*volume)/2; -#if 0 +#if 1 double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2; #else double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2; From 019ffe17d4f1ba9d167cb45f62ea7a0df0c19adc Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Tue, 2 Feb 2021 11:32:23 +0100 Subject: [PATCH 124/201] Allow for GPU vector width beyond 64 --- Grid/util/Coordinate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/util/Coordinate.h b/Grid/util/Coordinate.h index 004fbc72..89f73264 100644 --- a/Grid/util/Coordinate.h +++ b/Grid/util/Coordinate.h @@ -88,7 +88,7 @@ public: // Coordinate class, maxdims = 8 for now. 
//////////////////////////////////////////////////////////////// #define GRID_MAX_LATTICE_DIMENSION (8) -#define GRID_MAX_SIMD (16) +#define GRID_MAX_SIMD (sizeof(vInteger)/sizeof(Integer)) static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION; From 9b9a53f87066b5c67b607f56cb73c56efa5d4243 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 2 Feb 2021 13:06:43 +0000 Subject: [PATCH 125/201] ... --- Grid/qcd/utils/BaryonUtils.h | 53 ++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 8a4ff6ac..94cc07b1 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -1350,15 +1350,19 @@ void BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, // \gamma_\mu^L * Dq_loop auto trGDq = TensorRemove(trace(Gamma_H * Dq_loop)); + Real ee; + for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a - int b_s = epsilon[ie_s][1]; //b - int c_s = epsilon[ie_s][2]; //c + int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' + int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' + int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_s = (ie_s < 3 ? 1 : -1); for (int ie_x=0; ie_x < 6 ; ie_x++){ - int a_x = epsilon[ie_x][0]; //a' - int b_x = epsilon[ie_x][1]; //b' - int c_x = epsilon[ie_x][2]; //c' - auto ee_GD = epsilon_sgn[ie_s] * epsilon_sgn[ie_x] * trGDq; + int a_x = (ie_x < 3 ? ie_x : (6-ie_x)%3 ); //epsilon[ie_x][0]; //a' + int b_x = (ie_x < 3 ? (ie_x+1)%3 : (8-ie_x)%3 ); //epsilon[ie_x][1]; //b' + int c_x = (ie_x < 3 ? (ie_x+2)%3 : (7-ie_x)%3 ); //epsilon[ie_x][2]; //c' + int eSgn_x = (ie_x < 3 ? 
1 : -1); + ee = Real(eSgn_s * eSgn_x); for (int alpha_x=0; alpha_x::XiToSigmaEye(const PropagatorField &qq_loop, const Gamma GammaB_xi, const Gamma GammaB_sigma, const std::string op, - SpinMatrixField &stn_corr) + SpinMatrixField &xts_corr) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); @@ -1459,24 +1463,31 @@ void BaryonUtils::XiToSigmaEye(const PropagatorField &qq_loop, GridBase *grid = qs_ti.Grid(); - autoView( vcorr, stn_corr, CpuWrite); - autoView( vq_loop , qq_loop, CpuRead); - autoView( vd_tf , qd_tf, CpuRead); - autoView( vs_ti , qs_ti, CpuRead); + autoView( vcorr , xts_corr , AcceleratorWrite); + autoView( vq_loop , qq_loop , AcceleratorRead); + autoView( vd_tf , qd_tf , AcceleratorRead); + autoView( vs_ti , qs_ti , AcceleratorRead); + + bool doQ1 = (op == "Q1"); + bool doQ2 = (op == "Q2"); + + Vector my_Dq_spec{Dd_spec,Ds_spec}; + mobj * Dq_spec_p = &my_Dq_spec[0]; accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_loop = vq_loop[ss]; - auto Dd_tf = vd_tf[ss]; - auto Ds_ti = vs_ti[ss]; - sobj result=Zero(); - if(op == "Q1"){ - XiToSigmaQ1EyeSite(Dq_loop,Dd_spec,Ds_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); - } else if(op == "Q2"){ - XiToSigmaQ2EyeSite(Dq_loop,Dd_spec,Ds_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + if(doQ1){ + XiToSigmaQ1EyeSite(Dq_loop,Dq_spec_p[0],Dq_spec_p[1],Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); + } else if(doQ2){ + XiToSigmaQ2EyeSite(Dq_loop,Dq_spec_p[0],Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); } else { assert(0 && "Weak Operator not correctly specified"); } - vcorr[ss] = result; + coalescedWrite(vcorr[ss],result); } );//end loop over lattice sites } From 3215d88a91214bc6bbed75cf21cf1a4a28b0bbfb Mon Sep 17 00:00:00 2001 From: Michael Marshall 
<43034299+mmphys@users.noreply.github.com> Date: Wed, 3 Feb 2021 15:17:03 +0000 Subject: [PATCH 126/201] Simplify syntax with Grid::EnableIf post code review. Updated EnableIf so that ReturnType defaults to void in same way as std::enable_if see https://en.cppreference.com/w/cpp/types/enable_if --- Grid/lattice/Lattice_transfer.h | 9 +++------ Grid/simd/Grid_vector_types.h | 4 ++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index c91fa4d1..5a26cce9 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -98,8 +98,7 @@ accelerator_inline void convertType(ComplexF & out, const std::complex & } template -accelerator_inline typename std::enable_if::value>::type -convertType(T & out, const T & in) { +accelerator_inline EnableIf> convertType(T & out, const T & in) { out = in; } @@ -138,14 +137,12 @@ accelerator_inline void convertType(iScalar & out, const iScalar & in) { } template -accelerator_inline typename std::enable_if::value>::type -convertType(T1 & out, const iScalar & in) { +accelerator_inline NotEnableIf> convertType(T1 & out, const iScalar & in) { convertType(out,in._internal); } template -accelerator_inline typename std::enable_if::value>::type -convertType(iScalar & out, const T2 & in) { +accelerator_inline NotEnableIf> convertType(iScalar & out, const T2 & in) { convertType(out._internal,in); } diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index c07077a3..4f952bb2 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -208,8 +208,8 @@ struct RealPart > { ////////////////////////////////////// // type alias used to simplify the syntax of std::enable_if template using Invoke = typename T::type; -template using EnableIf = Invoke >; -template using NotEnableIf = Invoke >; +template using EnableIf = Invoke >; +template using NotEnableIf = Invoke >; 
//////////////////////////////////////////////////////// // Check for complexity with type traits From 4705aa541d62e16b070452e4a3f329d9f9565afa Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Thu, 4 Feb 2021 14:25:55 +0100 Subject: [PATCH 127/201] Allow user to configure ShmDims via environment variables --- Grid/communicator/SharedMemoryMPI.cc | 18 ++++++++++++++++++ Grid/util/Init.cc | 2 +- Grid/util/Init.h | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index a12418e6..466f6a1e 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -7,6 +7,7 @@ Copyright (C) 2015 Author: Peter Boyle +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -169,6 +170,23 @@ static inline int divides(int a,int b) } void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) { + //////////////////////////////////////////////////////////////// + // Allow user to configure through environment variable + //////////////////////////////////////////////////////////////// + char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str()); + if ( str ) { + std::vector IntShmDims; + GridCmdOptionIntVector(std::string(str),IntShmDims); + assert(IntShmDims.size() == WorldDims.size()); + long ShmSize = 1; + for (int dim=0;dim & vec) } template -void GridCmdOptionIntVector(std::string &str,VectorInt & vec) +void GridCmdOptionIntVector(const std::string &str,VectorInt & vec) { vec.resize(0); std::stringstream ss(str); diff --git a/Grid/util/Init.h b/Grid/util/Init.h index dad963a0..4eb8f06c 100644 --- a/Grid/util/Init.h +++ b/Grid/util/Init.h @@ -55,7 +55,7 @@ template std::string GridCmdVectorIntToString(const VectorInt & vec); void GridCmdOptionCSL(std::string str,std::vector & vec); template -void 
GridCmdOptionIntVector(std::string &str,VectorInt & vec); +void GridCmdOptionIntVector(const std::string &str,VectorInt & vec); void GridCmdOptionInt(std::string &str,int & val); From cd99edcc5f0e3b12106c07652f073eeb5be985c7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 4 Feb 2021 18:25:49 -0500 Subject: [PATCH 128/201] maxLocalNorm2() --- Grid/communicator/Communicator_base.h | 3 +- Grid/communicator/Communicator_mpi3.cc | 10 +++++ Grid/communicator/Communicator_none.cc | 2 + Grid/lattice/Lattice_reduction.h | 53 +++++++++++++++++++++++++- tests/core/Test_main.cc | 16 +++++++- 5 files changed, 81 insertions(+), 3 deletions(-) diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index bb06d43f..a15f9789 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -108,6 +107,8 @@ public: //////////////////////////////////////////////////////////// // Reduction //////////////////////////////////////////////////////////// + void GlobalMax(RealD &); + void GlobalMax(RealF &); void GlobalSum(RealF &); void GlobalSumVector(RealF *,int N); void GlobalSum(RealD &); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index c6543851..5713fe35 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -275,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); assert(ierr==0); } +void CartesianCommunicator::GlobalMax(float &f) +{ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalMax(double &d) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); + assert(ierr==0); 
+} void CartesianCommunicator::GlobalSum(float &f){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); diff --git a/Grid/communicator/Communicator_none.cc b/Grid/communicator/Communicator_none.cc index 6cb431a2..beb2cc97 100644 --- a/Grid/communicator/Communicator_none.cc +++ b/Grid/communicator/Communicator_none.cc @@ -67,6 +67,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors) CartesianCommunicator::~CartesianCommunicator(){} +void CartesianCommunicator::GlobalMax(float &){} +void CartesianCommunicator::GlobalMax(double &){} void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSum(double &){} diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index c2955485..7338fd41 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -96,8 +96,34 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) ssobj ret = ssum; return ret; } +/* +Threaded max, don't use for now +template +inline Double max(const Double *arg, Integer osites) +{ + // const int Nsimd = vobj::Nsimd(); + const int nthread = GridThread::GetThreads(); - + std::vector maxarray(nthread); + + thread_for(thr,nthread, { + int nwork, mywork, myoff; + nwork = osites; + GridThread::GetWork(nwork,thr,mywork,myoff); + Double max=arg[0]; + for(int ss=myoff;ss max ) max = arg[ss]; + } + maxarray[thr]=max; + }); + + Double tmax=maxarray[0]; + for(int i=0;itmax) tmax = maxarray[i]; + } + return tmax; +} +*/ template inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) { @@ -140,6 +166,31 @@ template inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return real(nrm); } +template inline RealD maxLocalNorm2(const Lattice &arg) +{ + typedef typename vobj::tensor_reduced vscalar; + typedef typename vobj::scalar_object scalar; + typedef 
typename getPrecision::real_scalar_type rscalar; + + Lattice inner = localNorm2(arg); + + auto grid = arg.Grid(); + + RealD max; + for(int l=0;llSites();l++){ + Coordinate coor; + scalar val; + RealD r; + grid->LocalIndexToLocalCoor(l,coor); + peekLocalSite(val,inner,coor); + r=real(TensorRemove(val)); + if( (l==0) || (r>max)){ + max=r; + } + } + grid->GlobalMax(max); + return max; +} // Double inner product template diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc index d7ed04ba..d3e6bfbd 100644 --- a/tests/core/Test_main.cc +++ b/tests/core/Test_main.cc @@ -231,6 +231,19 @@ int main(int argc, char **argv) { scalar = localInnerProduct(cVec, cVec); scalar = localNorm2(cVec); + std::cout << "Testing maxLocalNorm2" < shiftcoor = coor; shiftcoor[dir] = (shiftcoor[dir] + shift + latt_size[dir]) % - (latt_size[dir] / mpi_layout[dir]); + (latt_size[dir]); + // (latt_size[dir] / mpi_layout[dir]); std::vector rl(4); for (int dd = 0; dd < 4; dd++) { From eda9ab487babbf2409fef80fd69f3b60ed532480 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 8 Feb 2021 10:47:22 -0500 Subject: [PATCH 129/201] MADWF 5d source option for hadrons - look at Grid of source Abort on GPU error --- Grid/qcd/action/fermion/MADWF.h | 14 +++++++++++--- Grid/threads/Accelerator.cc | 1 + Grid/threads/Accelerator.h | 3 +++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/Grid/qcd/action/fermion/MADWF.h b/Grid/qcd/action/fermion/MADWF.h index 6b3c6e71..5d17e865 100644 --- a/Grid/qcd/action/fermion/MADWF.h +++ b/Grid/qcd/action/fermion/MADWF.h @@ -85,7 +85,7 @@ class MADWF maxiter =_maxiter; }; - void operator() (const FermionFieldo &src4,FermionFieldo &sol5) + void operator() (const FermionFieldo &src,FermionFieldo &sol5) { std::cout << GridLogMessage<< " ************************************************" << std::endl; std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl; @@ -114,8 +114,16 @@ class MADWF /////////////////////////////////////// //Import 
source, include Dminus factors /////////////////////////////////////// - Mato.ImportPhysicalFermionSource(src4,b); - std::cout << GridLogMessage << " src4 " < NAMESPACE_BEGIN(Grid); +int acceleratorAbortOnGpuError=1; uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 6232aea8..59645546 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -100,6 +100,8 @@ void acceleratorInit(void); #define accelerator __host__ __device__ #define accelerator_inline __host__ __device__ inline +extern int acceleratorAbortOnGpuError; + accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT return threadIdx.z; @@ -140,6 +142,7 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) printf("Cuda error %s \n", cudaGetErrorString( err )); \ puts(__FILE__); \ printf("Line %d\n",__LINE__); \ + if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \ } \ } From 55de69a56953f12380e3a276262ef0f547b5e28c Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Mon, 8 Feb 2021 12:03:16 -0500 Subject: [PATCH 130/201] Fixed compile issues with maxLocalNorm2 for non-scalar lattices maxLocalNorm2 test now reuses the random field --- Grid/lattice/Lattice_reduction.h | 7 ++++--- tests/core/Test_main.cc | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 7338fd41..0a5fbcb6 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -166,11 +166,12 @@ template inline RealD norm2(const Lattice &arg){ ComplexD nrm = innerProduct(arg,arg); return real(nrm); } + +//The global maximum of the site norm2 template inline RealD maxLocalNorm2(const Lattice &arg) { - typedef typename vobj::tensor_reduced vscalar; - typedef typename 
vobj::scalar_object scalar; - typedef typename getPrecision::real_scalar_type rscalar; + typedef typename vobj::tensor_reduced vscalar; //iScalar > > + typedef typename vscalar::scalar_object scalar; //iScalar > > Lattice inner = localNorm2(arg); diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc index d3e6bfbd..6e316aa6 100644 --- a/tests/core/Test_main.cc +++ b/tests/core/Test_main.cc @@ -232,12 +232,13 @@ int main(int argc, char **argv) { scalar = localNorm2(cVec); std::cout << "Testing maxLocalNorm2" < Date: Sun, 14 Feb 2021 21:27:54 +0000 Subject: [PATCH 131/201] Seems the intention with AutoConf produced Grid/Config.h was to use sed to translate standard PACKAGE_ #defines into GRID_ however due to missing '' after -i this hasn't been working. Perhaps it is too late to fix this, since we don't know who/what is relying on this downstream? ... but if they are, and AutoConf is being used, then likely these #defines have just been redefined anyway. Seems reasonable to redefine PACKAGE and VERSION as well, as none of these macros are used throughout Grid or Hadrons. 
--- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index f077ca93..7d95e4e2 100644 --- a/configure.ac +++ b/configure.ac @@ -7,7 +7,7 @@ AM_INIT_AUTOMAKE([subdir-objects 1.13]) AM_EXTRA_RECURSIVE_TARGETS([tests bench]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_SRCDIR([Grid/Grid.h]) -AC_CONFIG_HEADERS([Grid/Config.h],[sed -i 's|PACKAGE_|GRID_|' Grid/Config.h]) +AC_CONFIG_HEADERS([Grid/Config.h],[[sed -i '' -e 's|PACKAGE_|GRID_|' -e 's|[[:space:]]PACKAGE[[:space:]]| GRID_PACKAGE |' -e 's|[[:space:]]VERSION[[:space:]]| GRID_PACKAGE_VERSION |' Grid/Config.h]]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ################ Get git info From 35114c9e629c53546c8e95edd6def7b3e1692c7a Mon Sep 17 00:00:00 2001 From: Michael Marshall <43034299+mmphys@users.noreply.github.com> Date: Wed, 17 Feb 2021 13:24:15 +0000 Subject: [PATCH 132/201] Mac OS (Darwin) sed -i flag for in-place editing differs from posix / gnu --- configure.ac | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 7d95e4e2..702ce826 100644 --- a/configure.ac +++ b/configure.ac @@ -7,7 +7,12 @@ AM_INIT_AUTOMAKE([subdir-objects 1.13]) AM_EXTRA_RECURSIVE_TARGETS([tests bench]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_SRCDIR([Grid/Grid.h]) -AC_CONFIG_HEADERS([Grid/Config.h],[[sed -i '' -e 's|PACKAGE_|GRID_|' -e 's|[[:space:]]PACKAGE[[:space:]]| GRID_PACKAGE |' -e 's|[[:space:]]VERSION[[:space:]]| GRID_PACKAGE_VERSION |' Grid/Config.h]]) +AC_CONFIG_HEADERS([Grid/Config.h],[[$SED_INPLACE -e 's|PACKAGE_|GRID_|' -e 's|[[:space:]]PACKAGE[[:space:]]| GRID_PACKAGE |' -e 's|[[:space:]]VERSION[[:space:]]| GRID_PACKAGE_VERSION |' Grid/Config.h]], + [if test x"$host_os" == x"${host_os#darwin}" ; then] + [SED_INPLACE="sed -i"] + [else] + [SED_INPLACE="sed -i .bak"] + [fi]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ################ Get git info From 86b58d5aff2adac0b41b4fc7be22e77dfbd583e2 Mon Sep 17 
00:00:00 2001 From: Felix Erben Date: Thu, 18 Feb 2021 12:04:32 +0000 Subject: [PATCH 133/201] changed if and accelerator_for - no runtime errors any more --- Grid/qcd/utils/BaryonUtils.h | 129 ++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 56c5781d..4ac5f685 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -513,19 +513,18 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, GridBase *grid = q1_left.Grid(); - autoView( vbaryon_corr , baryon_corr , AcceleratorWrite); - autoView( vcorr_read , baryon_corr , AcceleratorRead); - autoView( v1 , q1_left , AcceleratorRead); - autoView( v2 , q2_left , AcceleratorRead); - autoView( v3 , q3_left , AcceleratorRead); + autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( v1 , q1_left , AcceleratorRead); + autoView( v2 , q2_left , AcceleratorRead); + autoView( v3 , q3_left , AcceleratorRead); Real bytes =0.; bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); for (int ie=0; ie < 6 ; ie++){ if(ie==0 or ie==3){ - //bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; + bytes += ( wick_contractions & (1 << ie) ) ? grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) : 0.; } else{ - //bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; + bytes += ( wick_contractions & (1 << ie) ) ? 
grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) : 0.; } } Real t=0.; @@ -535,8 +534,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D1 = v1(ss); auto D2 = v2(ss); auto D3 = v3(ss); - //typedef decltype(coalescedRead(vbaryon_corr[0])) cVec; - typedef decltype(coalescedRead(vcorr_read[0])) cVec; + typedef decltype(coalescedRead(vbaryon_corr[0])) cVec; cVec result=Zero(); BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); coalescedWrite(vbaryon_corr[ss],result); @@ -561,21 +559,19 @@ void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - + GridBase *grid = q1_left.Grid(); - autoView( vbaryon_corr , baryon_corr , AcceleratorWrite); - autoView( vcorr_read , baryon_corr , AcceleratorRead); - autoView( v1 , q1_left , AcceleratorRead); - autoView( v2 , q2_left , AcceleratorRead); - autoView( v3 , q3_left , AcceleratorRead); + autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( v1 , q1_left , AcceleratorRead); + autoView( v2 , q2_left , AcceleratorRead); + autoView( v3 , q3_left , AcceleratorRead); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto D1 = v1(ss); auto D2 = v2(ss); auto D3 = v3(ss); - //typedef decltype(coalescedRead(vbaryon_corr[0])) spinor; - typedef decltype(coalescedRead(vcorr_read[0])) spinor; + typedef decltype(coalescedRead(vbaryon_corr[0])) spinor; spinor result=Zero(); BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); coalescedWrite(vbaryon_corr[ss],result); @@ -941,10 +937,9 @@ void BaryonUtils::BaryonGamma3pt( GridBase *grid = q_tf.Grid(); - autoView( vcorr , stn_corr , AcceleratorWrite); - autoView( vcorr_read , stn_corr , AcceleratorRead); - autoView( vq_ti , q_ti , AcceleratorRead); - autoView( vq_tf , q_tf , AcceleratorRead); + 
autoView( vcorr , stn_corr , AcceleratorWrite); + autoView( vq_ti , q_ti , AcceleratorRead); + autoView( vq_tf , q_tf , AcceleratorRead); Vector my_Dq_spec{Dq_spec1,Dq_spec2}; mobj * Dq_spec_p = &my_Dq_spec[0]; @@ -953,29 +948,28 @@ void BaryonUtils::BaryonGamma3pt( accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_ti = vq_ti(ss); auto Dq_tf = vq_tf(ss); - typedef decltype(coalescedRead(vcorr_read[0])) spinor; + typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec_p[0],Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],coalescedRead(vcorr_read[ss])+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); });//end loop over lattice sites - } else if (group == 2) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_ti = vq_ti(ss); auto Dq_tf = vq_tf(ss); - typedef decltype(coalescedRead(vcorr_read[0])) spinor; + typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup2Site(Dq_spec_p[0],Dq_ti,Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],coalescedRead(vcorr_read[ss])+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); });//end loop over lattice sites } else if (group == 3) { accelerator_for(ss, grid->oSites(), grid->Nsimd(), { auto Dq_ti = vq_ti(ss); auto Dq_tf = vq_tf(ss); - typedef decltype(coalescedRead(vcorr_read[0])) spinor; + typedef decltype(coalescedRead(vcorr[0])) spinor; spinor result=Zero(); BaryonGamma3ptGroup3Site(Dq_spec_p[0],Dq_spec_p[1],Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - coalescedWrite(vcorr[ss],coalescedRead(vcorr_read[ss])+result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); });//end loop over lattice sites } @@ -1206,6 +1200,7 @@ void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, Real ee; + for (int ie_n=0; ie_n < 6 ; ie_n++){ int a_n = (ie_n < 3 ? 
ie_n : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b @@ -1250,6 +1245,7 @@ void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, }} } } + } template @@ -1275,27 +1271,32 @@ void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead); - bool doQ1 = (op == "Q1"); - bool doQ2 = (op == "Q2"); - Vector my_Dq_spec{Du_spec}; mobj * Dq_spec_p = &my_Dq_spec[0]; - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_loop = vq_loop(ss); - auto Dd_tf = vd_tf(ss); - auto Ds_ti = vs_ti(ss); - typedef decltype(coalescedRead(vcorr[0])) spinor; - spinor result=Zero(); - if(doQ1){ + if(op == "Q1"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); SigmaToNucleonQ1EyeSite(Dq_loop,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else if(doQ2){ + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else if(op == "Q2"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); SigmaToNucleonQ2EyeSite(Dq_loop,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else { - assert(0 && "Weak Operator not correctly specified"); - } - coalescedWrite(vcorr[ss],result); - });//end loop over lattice sites + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else { + assert(0 && "Weak Operator not correctly specified"); + } } template @@ -1322,29 +1323,35 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, autoView( vq_tf , qq_tf , AcceleratorRead ); autoView( vd_tf , qd_tf , AcceleratorRead ); autoView( 
vs_ti , qs_ti , AcceleratorRead ); - - bool doQ1 = (op == "Q1"); - bool doQ2 = (op == "Q2"); Vector my_Dq_spec{Du_spec}; mobj * Dq_spec_p = &my_Dq_spec[0]; - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti(ss); - auto Dq_tf = vq_tf(ss); - auto Dd_tf = vd_tf(ss); - auto Ds_ti = vs_ti(ss); - typedef decltype(coalescedRead(vcorr[0])) spinor; - spinor result=Zero(); - if(doQ1){ + if(op == "Q1"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else if(doQ2){ + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else if(op == "Q2"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else { - assert(0 && "Weak Operator not correctly specified"); - } - coalescedWrite(vcorr[ss],result); - });//end loop over lattice sites + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else { + assert(0 && "Weak Operator not correctly specified"); + } } NAMESPACE_END(Grid); From 7ae030f5851e8a9d74bedb79a28ef598b4006c26 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Thu, 18 Feb 2021 13:24:50 +0000 Subject: [PATCH 134/201] changed back A2AUtils warning --- Grid/qcd/utils/A2Autils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/utils/A2Autils.h b/Grid/qcd/utils/A2Autils.h index 497927dd..b63d8571 100644 --- a/Grid/qcd/utils/A2Autils.h +++ b/Grid/qcd/utils/A2Autils.h @@ -1047,7 +1047,7 @@ 
A2Autils::ContractWWVV(std::vector &WWVV, { GridBase *grid = vs[0].Grid(); - //int nd = grid->_ndimension; + int nd = grid->_ndimension; int Nsimd = grid->Nsimd(); int N_t = WW_sd.dimensions()[0]; int N_s = WW_sd.dimensions()[1]; From e3d019bc2f46c8c338bc3306a1954180a8b94dd7 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 22 Feb 2021 14:56:52 +0100 Subject: [PATCH 135/201] Enable performance counting in WilsonFermion like in others --- .../fermion/implementation/WilsonFermionImplementation.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 4977ea68..84ac25c1 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -397,6 +397,7 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co template void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=2; conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -408,6 +409,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da template void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { + DhopCalls++; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -420,6 +422,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int template void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls++; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check From c073e62e0be2f2e421588ad76f90dce33e7154c0 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 22 Feb 2021 15:17:07 +0100 Subject: [PATCH 136/201] Correct misleading ac help string --- 
configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 702ce826..fb0c78fc 100644 --- a/configure.ac +++ b/configure.ac @@ -130,7 +130,7 @@ esac ############### fermions AC_ARG_ENABLE([fermion-reps], - [AC_HELP_STRING([--fermion-reps=yes|no], [enable extra fermion representation support])], + [AC_HELP_STRING([--enable-fermion-reps=yes|no], [enable extra fermion representation support])], [ac_FERMION_REPS=${enable_fermion_reps}], [ac_FERMION_REPS=yes]) AM_CONDITIONAL(BUILD_FERMION_REPS, [ test "${ac_FERMION_REPS}X" == "yesX" ]) From d5ab571a894682e36bc24488958f01a18983c7f7 Mon Sep 17 00:00:00 2001 From: Christopher Kelly Date: Tue, 23 Feb 2021 11:49:56 -0500 Subject: [PATCH 137/201] Added the ability to apply a custom "filter" to the conjugate momentum in the Integrator classes, applied both after refresh and after applying the forces Added a conjugate momentum "filter" that applies a phase to each site. With sites set to 1.0 or 0.0 this acts as a mask and enables, for example, the freezing of inactive gauge links in DDHMC Added tests/forces/Test_momentum_filter demonstrating the use of the filter to freeze boundary links --- Grid/qcd/hmc/integrators/Integrator.h | 28 ++++ Grid/qcd/hmc/integrators/MomentumFilter.h | 94 +++++++++++++ tests/forces/Test_momentum_filter.cc | 154 ++++++++++++++++++++++ 3 files changed, 276 insertions(+) create mode 100644 Grid/qcd/hmc/integrators/MomentumFilter.h create mode 100644 tests/forces/Test_momentum_filter.cc diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index 70055754..77b7de52 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -33,6 +33,7 @@ directory #define INTEGRATOR_INCLUDED #include +#include "MomentumFilter.h" NAMESPACE_BEGIN(Grid); @@ -78,8 +79,19 @@ protected: RepresentationPolicy Representations; IntegratorParameters Params; + //Filters allow the user to manipulate the 
conjugate momentum, for example to freeze links in DDHMC + //It is applied whenever the momentum is updated / refreshed + //The default filter does nothing + MomentumFilterBase const* MomFilter; + const ActionSet as; + //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default + static MomentumFilterBase const* getDefaultMomFilter(){ + static MomentumFilterNone filter; + return &filter; + } + void update_P(Field& U, int level, double ep) { t_P[level] += ep; @@ -135,6 +147,8 @@ protected: // Force from the other representations as[level].apply(update_P_hireps, Representations, Mom, U, ep); + + MomFilter->applyFilter(Mom); } void update_U(Field& U, double ep) @@ -174,11 +188,23 @@ public: t_P.resize(levels, 0.0); t_U = 0.0; // initialization of smearer delegated outside of Integrator + + //Default the momentum filter to "do-nothing" + MomFilter = getDefaultMomFilter(); }; virtual ~Integrator() {} virtual std::string integrator_name() = 0; + + //Set the momentum filter allowing for manipulation of the conjugate momentum + void setMomentumFilter(const MomentumFilterBase &filter){ + MomFilter = &filter; + } + + //Access the conjugate momentum + const MomentaField & getMomentum() const{ return P; } + void print_parameters() { @@ -249,6 +275,8 @@ public: // Refresh the higher representation actions as[level].apply(refresh_hireps, Representations, pRNG); } + + MomFilter->applyFilter(P); } // to be used by the actionlevel class to iterate diff --git a/Grid/qcd/hmc/integrators/MomentumFilter.h b/Grid/qcd/hmc/integrators/MomentumFilter.h new file mode 100644 index 00000000..2a15d80c --- /dev/null +++ b/Grid/qcd/hmc/integrators/MomentumFilter.h @@ -0,0 +1,94 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/hmc/integrators/MomentumFilter.h + +Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + 
+This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +//-------------------------------------------------------------------- +#ifndef MOMENTUM_FILTER +#define MOMENTUM_FILTER + +NAMESPACE_BEGIN(Grid); + +//These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh + +template +struct MomentumFilterBase{ + virtual void applyFilter(MomentaField &P) const; +}; + +//Do nothing +template +struct MomentumFilterNone: public MomentumFilterBase{ + void applyFilter(MomentaField &P) const override{} +}; + +//Multiply each site/direction by a Lorentz vector complex number field +//Can be used to implement a mask, zeroing out sites +template +struct MomentumFilterApplyPhase: public MomentumFilterBase{ + typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type + typedef typename MomentaField::scalar_type scalar_type; //scalar complex type + typedef iVector >, Nd > LorentzScalarType; //complex phase for each site/direction + typedef Lattice LatticeLorentzScalarType; + + LatticeLorentzScalarType phase; + + MomentumFilterApplyPhase(const LatticeLorentzScalarType _phase): phase(_phase){} + + 
//Default to uniform field of (1,0) + MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){ + LorentzScalarType one; + for(int mu=0;mu + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +//Get the mu-directected links on the upper boundary and the bulk remainder +template +void getLinksBoundaryBulk(Field &bound, Field &bulk, Field &from, const Coordinate &latt_size){ + bound = Zero(); bulk = Zero(); + for(int mu=0;mu seeds({1,2,3,4}); + + GridParallelRNG pRNG(&Grid); + pRNG.SeedFixedIntegers(seeds); + + typedef PeriodicGimplR Gimpl; + typedef WilsonGaugeAction GaugeAction; + typedef NoHirep Representation; //fundamental + typedef NoSmearing Smearing; + typedef MinimumNorm2 Omelyan; + typedef Gimpl::Field Field; + typedef MomentumFilterApplyPhase Filter; + Filter filter(&Grid); + + //Setup a filter that disables link update on links passing through the global lattice boundary + typedef Filter::LatticeLorentzScalarType MaskType; + typedef Filter::LorentzScalarType MaskSiteType; + + MaskSiteType zero, one; + for(int mu=0;mu::HotConfiguration(pRNG,U); + + //Get the original 
links on the bulk and boundary for later use + Field Ubnd_orig(&Grid), Ubulk_orig(&Grid); + getLinksBoundaryBulk(Ubnd_orig, Ubulk_orig, U, latt_size); + + ActionSet actions(1); + double beta=6; + GaugeAction gauge_action(beta); + actions[0].push_back(&gauge_action); + + Smearing smear; + IntegratorParameters params(1,1.); //1 MD step + Omelyan integrator(&Grid, params, actions, smear); + + integrator.setMomentumFilter(filter); + + integrator.refresh(U, pRNG); //doesn't actually change the gauge field + + //Check the momentum is zero on the boundary + const auto &P = integrator.getMomentum(); + Field Pbnd(&Grid), Pbulk(&Grid); + getLinksBoundaryBulk(Pbnd, Pbulk, const_cast(P), latt_size); + + RealD Pbnd_nrm = norm2(Pbnd); //expect zero + std::cout << GridLogMessage << "After refresh, norm2 of mu-directed conjugate momentum on boundary is: " << Pbnd_nrm << " (expect 0)" << std::endl; + RealD Pbulk_nrm = norm2(Pbulk); //expect non-zero + std::cout << GridLogMessage << "After refresh, norm2 of bulk conjugate momentum is: " << Pbulk_nrm << " (expect non-zero)" << std::endl; + + //Evolve the gauge field + integrator.integrate(U); + + //Check momentum is still zero on boundary + getLinksBoundaryBulk(Pbnd, Pbulk, const_cast(P), latt_size); + + Pbnd_nrm = norm2(Pbnd); //expect zero + std::cout << GridLogMessage << "After integrate, norm2 of mu-directed conjugate momentum on boundary is: " << Pbnd_nrm << " (expect 0)" << std::endl; + Pbulk_nrm = norm2(Pbulk); //expect non-zero + std::cout << GridLogMessage << "After integrate, norm2 of bulk conjugate momentum is: " << Pbulk_nrm << " (expect non-zero)" << std::endl; + + //Get the new bulk and bound links + Field Ubnd_new(&Grid), Ubulk_new(&Grid); + getLinksBoundaryBulk(Ubnd_new, Ubulk_new, U, latt_size); + + Field Ubnd_diff = Ubnd_new - Ubnd_orig; + Field Ubulk_diff = Ubulk_new - Ubulk_orig; + + RealD Ubnd_change = norm2( Ubnd_diff ); + RealD Ubulk_change = norm2( Ubulk_diff ); + std::cout << GridLogMessage << "After 
integrate, norm2 of change in mu-directed boundary links is : " << Ubnd_change << " (expect 0)" << std::endl; + std::cout << GridLogMessage << "After integrate, norm2 of change in bulk links is : " << Ubulk_change << " (expect non-zero)" << std::endl; + + Grid_finalize(); +} From f9b1f240f6af103ff437b459bce2937026c0f4d3 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 26 Feb 2021 17:51:41 +0100 Subject: [PATCH 138/201] Better SIMD usage/coalescence --- Grid/lattice/Lattice_view.h | 7 ++++- Grid/simd/Grid_gpu_vec.h | 23 ++++++++++++-- Grid/simd/Simd.h | 12 ++++--- Grid/tensors/Tensor_SIMT.h | 62 +++++++++++++++++++++++++++++++++++++ Grid/threads/Accelerator.h | 53 ++++++++++++++++++++++++++----- 5 files changed, 143 insertions(+), 14 deletions(-) diff --git a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index 3b76b921..cb568abd 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -67,9 +67,14 @@ public: accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } #endif +#if 1 + // accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; + accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; }; +#else accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; - +#endif + accelerator_inline uint64_t begin(void) const { return 0;}; accelerator_inline uint64_t end(void) const { return this->_odata_size; }; accelerator_inline uint64_t size(void) const { return this->_odata_size; }; diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 8e55ce2f..2c1a38e7 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -60,11 +60,25 @@ template class GpuComplex { public: pair z; - typedef decltype(z.x) real; + typedef decltype(z.x) Real; public: accelerator_inline GpuComplex() = default; - accelerator_inline 
GpuComplex(real re,real im) { z.x=re; z.y=im; }; + accelerator_inline GpuComplex(Real re,Real im) { z.x=re; z.y=im; }; accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;}; + accelerator_inline Real real(void) const { return z.x; }; + accelerator_inline Real imag(void) const { return z.y; }; + accelerator_inline GpuComplex &operator*=(const GpuComplex &r) { + *this = (*this) * r; + return *this; + } + accelerator_inline GpuComplex &operator+=(const GpuComplex &r) { + *this = (*this) + r; + return *this; + } + accelerator_inline GpuComplex &operator-=(const GpuComplex &r) { + *this = (*this) - r; + return *this; + } friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) { GpuComplex r ; r.z.x = lhs.z.x + rhs.z.x; @@ -157,6 +171,11 @@ typedef GpuVector GpuVectorRD; typedef GpuVector GpuVectorCD; typedef GpuVector GpuVectorI; +accelerator_inline GpuComplexF timesI(const GpuComplexF &r) { return(GpuComplexF(-r.imag(),r.real()));} +accelerator_inline GpuComplexD timesI(const GpuComplexD &r) { return(GpuComplexD(-r.imag(),r.real()));} +accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r){ return(GpuComplexF(r.imag(),-r.real()));} +accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r){ return(GpuComplexD(r.imag(),-r.real()));} + accelerator_inline float half2float(half h) { float f; diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h index 1dc86c1b..76ca3bef 100644 --- a/Grid/simd/Simd.h +++ b/Grid/simd/Simd.h @@ -148,10 +148,14 @@ accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restri accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } //conjugate already supported for complex -accelerator_inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} -accelerator_inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} -accelerator_inline ComplexF 
timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} -accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} +accelerator_inline ComplexF timesI(const ComplexF &r) { return(ComplexF(-r.imag(),r.real()));} +accelerator_inline ComplexD timesI(const ComplexD &r) { return(ComplexD(-r.imag(),r.real()));} +accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(ComplexF(r.imag(),-r.real()));} +accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(ComplexD(r.imag(),-r.real()));} +//accelerator_inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} +//accelerator_inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} +//accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} +//accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} // define projections to real and imaginay parts accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));} diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index ec57a679..ede24fbe 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -64,6 +64,68 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ } #else + +#ifndef GRID_SYCL +// Use the scalar as our own complex on GPU +template = 0> accelerator_inline +typename vsimd::scalar_type +coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::scalar_type S; + S * __restrict__ p=(S *)&vec; + return p[lane]; +} +template = 0> accelerator_inline +typename vsimd::scalar_type +coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::scalar_type S; + + S * __restrict__ p=(S *)&vec; + int mask = vsimd::Nsimd() >> (ptype + 1); + int plane= doperm ? 
lane ^ mask : lane; + return p[plane]; +} +template = 0> accelerator_inline +void coalescedWrite(vsimd & __restrict__ vec, + const typename vsimd::scalar_type & __restrict__ extracted, + int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::scalar_type S; + S * __restrict__ p=(S *)&vec; + p[lane]=extracted; +} +#else +template = 0> accelerator_inline +typename vsimd::vector_type::datum +coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + S * __restrict__ p=(S *)&vec; + return p[lane]; +} +template = 0> accelerator_inline +typename vsimd::vector_type::datum +coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + + S * __restrict__ p=(S *)&vec; + int mask = vsimd::Nsimd() >> (ptype + 1); + int plane= doperm ? lane ^ mask : lane; + return p[plane]; +} +template = 0> accelerator_inline +void coalescedWrite(vsimd & __restrict__ vec, + const typename vsimd::vector_type::datum & __restrict__ extracted, + int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + S * __restrict__ p=(S *)&vec; + p[lane]=extracted; +} +#endif + ////////////////////////////////////////// // Extract and insert slices on the GPU ////////////////////////////////////////// diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 59645546..2b7bf53a 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -104,7 +104,7 @@ extern int acceleratorAbortOnGpuError; accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT - return threadIdx.z; + return threadIdx.x; #else return 0; #endif @@ -112,28 +112,67 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... 
) \ { \ + int nt=acceleratorThreads(); \ typedef uint64_t Iterator; \ auto lambda = [=] accelerator \ (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ __VA_ARGS__; \ }; \ - int nt=acceleratorThreads(); \ - dim3 cu_threads(acceleratorThreads(),1,nsimd); \ + dim3 cu_threads(nsimd,acceleratorThreads(),1); \ dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ LambdaApply<<>>(num1,num2,nsimd,lambda); \ } +#define accelerator_for6dNB(iter1, num1, \ + iter2, num2, \ + iter3, num3, \ + iter4, num4, \ + iter5, num5, \ + iter6, num6, ... ) \ + { \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2, \ + Iterator iter3,Iterator iter4, \ + Iterator iter5,Iterator iter6) mutable { \ + __VA_ARGS__; \ + }; \ + dim3 cu_blocks (num1,num2,num3); \ + dim3 cu_threads(num4,num5,num6); \ + Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ + } + template __global__ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) { - uint64_t x = threadIdx.x + blockDim.x*blockIdx.x; - uint64_t y = threadIdx.y + blockDim.y*blockIdx.y; - uint64_t z = threadIdx.z; + // Weird permute is to make lane coalesce for large blocks + uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; + uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; + uint64_t z = threadIdx.x; if ( (x < num1) && (y __global__ +void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, + uint64_t num4, uint64_t num5, uint64_t num6, + lambda Lambda) +{ + uint64_t iter1 = blockIdx.x; + uint64_t iter2 = blockIdx.y; + uint64_t iter3 = blockIdx.z; + uint64_t iter4 = threadIdx.x; + uint64_t iter5 = threadIdx.y; + uint64_t iter6 = threadIdx.z; + + if ( (iter1 < num1) && (iter2 global{unum1,unum2,nsimd}; \ cgh.parallel_for( \ cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) mutable { \ + [=] (cl::sycl::nd_item<3> item) /*mutable*/ { \ auto iter1 = item.get_global_id(0); \ auto iter2 = item.get_global_id(1); \ auto lane = item.get_global_id(2); \ 
From 442336bd96b9c0062a9c42704d2013331b3b10e1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 2 Mar 2021 14:50:51 +0100 Subject: [PATCH 139/201] Hand unrolled to use optimised code paths on GPU for coalesced reads in Wilson case. Other cases to do. This now includes comms code path. --- .../WilsonKernelsHandImplementation.h | 217 ++++++++++-------- .../WilsonKernelsImplementation.h | 12 +- 2 files changed, 123 insertions(+), 106 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index 89ae5668..b867369f 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -76,7 +76,24 @@ Author: paboyle #define REGISTER -#define LOAD_CHIMU \ +#ifdef GRID_SIMT +#define LOAD_CHIMU(ptype) \ + {const SiteSpinor & ref (in[offset]); \ + Chimu_00=coalescedReadPermute(ref()(0)(0),perm); \ + Chimu_01=coalescedReadPermute(ref()(0)(1),perm); \ + Chimu_02=coalescedReadPermute(ref()(0)(2),perm); \ + Chimu_10=coalescedReadPermute(ref()(1)(0),perm); \ + Chimu_11=coalescedReadPermute(ref()(1)(1),perm); \ + Chimu_12=coalescedReadPermute(ref()(1)(2),perm); \ + Chimu_20=coalescedReadPermute(ref()(2)(0),perm); \ + Chimu_21=coalescedReadPermute(ref()(2)(1),perm); \ + Chimu_22=coalescedReadPermute(ref()(2)(2),perm); \ + Chimu_30=coalescedReadPermute(ref()(3)(0),perm); \ + Chimu_31=coalescedReadPermute(ref()(3)(1),perm); \ + Chimu_32=coalescedReadPermute(ref()(3)(2),perm); } +#define PERMUTE_DIR(dir) ; +#else +#define LOAD_CHIMU \ {const SiteSpinor & ref (in[offset]); \ Chimu_00=ref()(0)(0);\ Chimu_01=ref()(0)(1);\ @@ -91,55 +108,55 @@ Author: paboyle Chimu_31=ref()(3)(1);\ Chimu_32=ref()(3)(2);} -#define LOAD_CHI\ - {const SiteHalfSpinor &ref(buf[offset]); \ - Chi_00 = ref()(0)(0);\ - Chi_01 = ref()(0)(1);\ - Chi_02 = ref()(0)(2);\ - Chi_10 = ref()(1)(0);\ - 
Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2);} - -// To splat or not to splat depends on the implementation -#define MULT_2SPIN(A)\ - {auto & ref(U[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - UChi_00 = U_00*Chi_00;\ - UChi_10 = U_00*Chi_10;\ - UChi_01 = U_10*Chi_00;\ - UChi_11 = U_10*Chi_10;\ - UChi_02 = U_20*Chi_00;\ - UChi_12 = U_20*Chi_10;\ - UChi_00+= U_01*Chi_01;\ - UChi_10+= U_01*Chi_11;\ - UChi_01+= U_11*Chi_01;\ - UChi_11+= U_11*Chi_11;\ - UChi_02+= U_21*Chi_01;\ - UChi_12+= U_21*Chi_11;\ - Impl::loadLinkElement(U_00,ref()(0,2)); \ - Impl::loadLinkElement(U_10,ref()(1,2)); \ - Impl::loadLinkElement(U_20,ref()(2,2)); \ - UChi_00+= U_00*Chi_02;\ - UChi_10+= U_00*Chi_12;\ - UChi_01+= U_10*Chi_02;\ - UChi_11+= U_10*Chi_12;\ - UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12;} - - #define PERMUTE_DIR(dir) \ - permute##dir(Chi_00,Chi_00);\ + permute##dir(Chi_00,Chi_00); \ permute##dir(Chi_01,Chi_01);\ permute##dir(Chi_02,Chi_02);\ - permute##dir(Chi_10,Chi_10);\ + permute##dir(Chi_10,Chi_10); \ permute##dir(Chi_11,Chi_11);\ permute##dir(Chi_12,Chi_12); +#endif + +#define LOAD_CHI \ + {const SiteHalfSpinor &ref(buf[offset]); \ + Chi_00 = coalescedRead(ref()(0)(0)); \ + Chi_01 = coalescedRead(ref()(0)(1)); \ + Chi_02 = coalescedRead(ref()(0)(2)); \ + Chi_10 = coalescedRead(ref()(1)(0)); \ + Chi_11 = coalescedRead(ref()(1)(1)); \ + Chi_12 = coalescedRead(ref()(1)(2));} + +#define MULT_2SPIN(A)\ + {auto & ref(U[sU](A)); \ + U_00=coalescedRead(ref()(0,0)); \ + U_10=coalescedRead(ref()(1,0)); \ + U_20=coalescedRead(ref()(2,0)); \ + U_01=coalescedRead(ref()(0,1)); \ + U_11=coalescedRead(ref()(1,1)); \ + U_21=coalescedRead(ref()(2,1)); \ + UChi_00 = U_00*Chi_00; \ + UChi_10 = U_00*Chi_10; \ + UChi_01 = U_10*Chi_00; \ + UChi_11 = U_10*Chi_10; 
\ + UChi_02 = U_20*Chi_00; \ + UChi_12 = U_20*Chi_10; \ + UChi_00+= U_01*Chi_01; \ + UChi_10+= U_01*Chi_11; \ + UChi_01+= U_11*Chi_01; \ + UChi_11+= U_11*Chi_11; \ + UChi_02+= U_21*Chi_01; \ + UChi_12+= U_21*Chi_11; \ + U_00=coalescedRead(ref()(0,2)); \ + U_10=coalescedRead(ref()(1,2)); \ + U_20=coalescedRead(ref()(2,2)); \ + UChi_00+= U_00*Chi_02; \ + UChi_10+= U_00*Chi_12; \ + UChi_01+= U_10*Chi_02; \ + UChi_11+= U_10*Chi_12; \ + UChi_02+= U_20*Chi_02; \ + UChi_12+= U_20*Chi_12;} + // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); #define XP_PROJ \ @@ -359,7 +376,7 @@ Author: paboyle local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHIMU; \ + LOAD_CHIMU(PERM); \ PROJ; \ if ( perm) { \ PERMUTE_DIR(PERM); \ @@ -376,7 +393,7 @@ Author: paboyle local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHIMU; \ + LOAD_CHIMU(PERM); \ PROJ; \ if ( perm) { \ PERMUTE_DIR(PERM); \ @@ -401,40 +418,39 @@ Author: paboyle #define HAND_RESULT(ss) \ { \ - SiteSpinor & ref (out[ss]); \ - vstream(ref()(0)(0),result_00); \ - vstream(ref()(0)(1),result_01); \ - vstream(ref()(0)(2),result_02); \ - vstream(ref()(1)(0),result_10); \ - vstream(ref()(1)(1),result_11); \ - vstream(ref()(1)(2),result_12); \ - vstream(ref()(2)(0),result_20); \ - vstream(ref()(2)(1),result_21); \ - vstream(ref()(2)(2),result_22); \ - vstream(ref()(3)(0),result_30); \ - vstream(ref()(3)(1),result_31); \ - vstream(ref()(3)(2),result_32); \ + SiteSpinor & ref (out[ss]); \ + coalescedWrite(ref()(0)(0),result_00); \ + coalescedWrite(ref()(0)(1),result_01); \ + coalescedWrite(ref()(0)(2),result_02); \ + coalescedWrite(ref()(1)(0),result_10); \ + coalescedWrite(ref()(1)(1),result_11); \ + coalescedWrite(ref()(1)(2),result_12); \ + coalescedWrite(ref()(2)(0),result_20); \ + coalescedWrite(ref()(2)(1),result_21); \ + coalescedWrite(ref()(2)(2),result_22); \ + coalescedWrite(ref()(3)(0),result_30); \ + coalescedWrite(ref()(3)(1),result_31); \ + 
coalescedWrite(ref()(3)(2),result_32); \ } -#define HAND_RESULT_EXT(ss) \ - if (nmu){ \ - SiteSpinor & ref (out[ss]); \ - ref()(0)(0)+=result_00; \ - ref()(0)(1)+=result_01; \ - ref()(0)(2)+=result_02; \ - ref()(1)(0)+=result_10; \ - ref()(1)(1)+=result_11; \ - ref()(1)(2)+=result_12; \ - ref()(2)(0)+=result_20; \ - ref()(2)(1)+=result_21; \ - ref()(2)(2)+=result_22; \ - ref()(3)(0)+=result_30; \ - ref()(3)(1)+=result_31; \ - ref()(3)(2)+=result_32; \ +#define HAND_RESULT_EXT(ss) \ + { \ + SiteSpinor & ref (out[ss]); \ + coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00); \ + coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01); \ + coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02); \ + coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10); \ + coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11); \ + coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12); \ + coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20); \ + coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21); \ + coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22); \ + coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30); \ + coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31); \ + coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32); \ } - -#define HAND_DECLARATIONS(a) \ +#define HAND_DECLARATIONS(Simd) \ Simd result_00; \ Simd result_01; \ Simd result_02; \ @@ -467,18 +483,18 @@ Author: paboyle Simd U_21; #define ZERO_RESULT \ - result_00=Zero(); \ - result_01=Zero(); \ - result_02=Zero(); \ - result_10=Zero(); \ - result_11=Zero(); \ - result_12=Zero(); \ - result_20=Zero(); \ - result_21=Zero(); \ - result_22=Zero(); \ - result_30=Zero(); \ - result_31=Zero(); \ - result_32=Zero(); + result_00=S(0.0,0.0); \ + result_01=S(0.0,0.0); \ + result_02=S(0.0,0.0); \ + result_10=S(0.0,0.0); \ + result_11=S(0.0,0.0); \ + result_12=S(0.0,0.0); \ + 
result_20=S(0.0,0.0); \ + result_21=S(0.0,0.0); \ + result_22=S(0.0,0.0); \ + result_30=S(0.0,0.0); \ + result_31=S(0.0,0.0); \ + result_32=S(0.0,0.0); #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -502,8 +518,8 @@ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + HAND_DECLARATIONS(Simt); int offset,local,perm, ptype; StencilEntry *SE; @@ -525,8 +541,8 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + HAND_DECLARATIONS(Simt); StencilEntry *SE; int offset,local,perm, ptype; @@ -549,8 +565,8 @@ WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + HAND_DECLARATIONS(Simt); int offset,local,perm, ptype; StencilEntry *SE; @@ -572,8 +588,8 @@ void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + HAND_DECLARATIONS(Simt); StencilEntry *SE; int offset,local,perm, ptype; @@ -596,8 +612,8 @@ WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + HAND_DECLARATIONS(Simt); int offset, ptype; StencilEntry *SE; @@ -620,8 +636,8 @@ void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi { typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - - HAND_DECLARATIONS(ignore); + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + HAND_DECLARATIONS(Simt); StencilEntry *SE; int offset, ptype; @@ -682,3 +698,4 @@ NAMESPACE_END(Grid); #undef HAND_RESULT #undef HAND_RESULT_INT #undef HAND_RESULT_EXT +#undef HAND_DECLARATIONS diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index c5f50bbb..937d13af 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -445,20 +445,20 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); 
return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} #endif } @@ -476,20 +476,20 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} #endif } From 679d1d22f7e920163acf7b88515c30ceb297387b Mon Sep 17 00:00:00 2001 From: u61464 Date: Wed, 3 Mar 2021 11:21:43 -0800 Subject: [PATCH 140/201] Sycl happier --- Grid/Makefile.am | 4 ++- Grid/cartesian/Cartesian_red_black.h | 2 +- Grid/qcd/action/fermion/FermionOperatorImpl.h | 4 +-- Grid/qcd/action/fermion/WilsonCompressor.h | 24 ++++++++-------- Grid/qcd/action/fermion/WilsonImpl.h | 2 +- .../WilsonKernelsHandImplementation.h | 28 +++++++++---------- Grid/simd/Grid_gpu_vec.h | 1 + Grid/stencil/SimpleCompressor.h | 14 +++++----- Grid/stencil/Stencil.h | 14 +++++----- Grid/tensors/Tensor_SIMT.h | 4 ++- Grid/threads/Accelerator.h | 2 +- benchmarks/Benchmark_dwf_fp32.cc | 2 +- configure.ac | 11 ++++++++ 13 files changed, 64 insertions(+), 48 deletions(-) 
diff --git a/Grid/Makefile.am b/Grid/Makefile.am index ded6d146..7c3c151b 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -54,9 +54,11 @@ Version.h: version-cache include Make.inc include Eigen.inc -extra_sources+=$(ZWILS_FERMION_FILES) extra_sources+=$(WILS_FERMION_FILES) extra_sources+=$(STAG_FERMION_FILES) +if BUILD_ZMOBIUS + extra_sources+=$(ZWILS_FERMION_FILES) +endif if BUILD_GPARITY extra_sources+=$(GP_FERMION_FILES) endif diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h index b71981f5..092d4910 100644 --- a/Grid/cartesian/Cartesian_red_black.h +++ b/Grid/cartesian/Cartesian_red_black.h @@ -36,7 +36,7 @@ static const int CbBlack=1; static const int Even =CbRed; static const int Odd =CbBlack; -accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk) +accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk) { int nd=rdim.size(); Coordinate coor(nd); diff --git a/Grid/qcd/action/fermion/FermionOperatorImpl.h b/Grid/qcd/action/fermion/FermionOperatorImpl.h index b444f6dc..9345c0e6 100644 --- a/Grid/qcd/action/fermion/FermionOperatorImpl.h +++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h @@ -153,8 +153,8 @@ public: typedef typename Impl::StencilImpl StencilImpl; \ typedef typename Impl::ImplParams ImplParams; \ typedef typename Impl::StencilImpl::View_type StencilView; \ - typedef typename ViewMap::Type FermionFieldView; \ - typedef typename ViewMap::Type DoubledGaugeFieldView; + typedef const typename ViewMap::Type FermionFieldView; \ + typedef const typename ViewMap::Type DoubledGaugeFieldView; #define INHERIT_IMPL_TYPES(Base) \ INHERIT_GIMPL_TYPES(Base) \ diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 10e98f33..0760bcba 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -61,7 +61,7 
@@ public: typedef typename SiteHalfSpinor::vector_type vComplexHigh; constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - accelerator_inline int CommDatumSize(void) { + accelerator_inline int CommDatumSize(void) const { return sizeof(SiteHalfCommSpinor); } @@ -69,7 +69,7 @@ public: /* Compress includes precision change if mpi data is not same */ /*****************************************************/ template - accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { + accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const { _SiteHalfSpinor tmp; projector::Proj(tmp,in,mu,dag); vstream(buf[o],tmp); @@ -81,7 +81,7 @@ public: accelerator_inline void Exchange(SiteHalfSpinor *mp, const SiteHalfSpinor * __restrict__ vp0, const SiteHalfSpinor * __restrict__ vp1, - Integer type,Integer o){ + Integer type,Integer o) const { SiteHalfSpinor tmp1; SiteHalfSpinor tmp2; exchange(tmp1,tmp2,vp0[o],vp1[o],type); @@ -93,7 +93,7 @@ public: /* Have a decompression step if mpi data is not same */ /*****************************************************/ accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out, - SiteHalfSpinor * __restrict__ in, Integer o) { + SiteHalfSpinor * __restrict__ in, Integer o) const { assert(0); } @@ -103,7 +103,7 @@ public: accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0, SiteHalfSpinor * __restrict__ out1, const SiteSpinor * __restrict__ in, - Integer j,Integer k, Integer m,Integer type) + Integer j,Integer k, Integer m,Integer type) const { SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp3, temp4; @@ -117,7 +117,7 @@ public: /*****************************************************/ /* Pass the info to the stencil */ /*****************************************************/ - accelerator_inline bool DecompressionStep(void) { return false; } + accelerator_inline bool DecompressionStep(void) const { return false; } }; @@ -142,7 +142,7 @@ 
public: typedef typename SiteHalfSpinor::vector_type vComplexHigh; constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - accelerator_inline int CommDatumSize(void) { + accelerator_inline int CommDatumSize(void) const { return sizeof(SiteHalfCommSpinor); } @@ -150,7 +150,7 @@ public: /* Compress includes precision change if mpi data is not same */ /*****************************************************/ template - accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { + accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const { _SiteHalfSpinor hsp; SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; projector::Proj(hsp,in,mu,dag); @@ -163,7 +163,7 @@ public: accelerator_inline void Exchange(SiteHalfSpinor *mp, SiteHalfSpinor *vp0, SiteHalfSpinor *vp1, - Integer type,Integer o){ + Integer type,Integer o) const { SiteHalfSpinor vt0,vt1; SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; @@ -175,7 +175,7 @@ public: /*****************************************************/ /* Have a decompression step if mpi data is not same */ /*****************************************************/ - accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){ + accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const { SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); } @@ -186,7 +186,7 @@ public: accelerator_inline void CompressExchange(SiteHalfSpinor *out0, SiteHalfSpinor *out1, const SiteSpinor *in, - Integer j,Integer k, Integer m,Integer type){ + Integer j,Integer k, Integer m,Integer type) const { SiteHalfSpinor temp1, temp2,temp3,temp4; SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; @@ -200,7 +200,7 @@ public: 
/*****************************************************/ /* Pass the info to the stencil */ /*****************************************************/ - accelerator_inline bool DecompressionStep(void) { return true; } + accelerator_inline bool DecompressionStep(void) const { return true; } }; diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index d7941d1f..94676b6b 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -72,7 +72,7 @@ public: typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; - typedef typename StencilImpl::View_type StencilView; + typedef const typename StencilImpl::View_type StencilView; ImplParams Params; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index b867369f..688cb75a 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -93,7 +93,7 @@ Author: paboyle Chimu_32=coalescedReadPermute(ref()(3)(2),perm); } #define PERMUTE_DIR(dir) ; #else -#define LOAD_CHIMU \ +#define LOAD_CHIMU(ptype) \ {const SiteSpinor & ref (in[offset]); \ Chimu_00=ref()(0)(0);\ Chimu_01=ref()(0)(1);\ @@ -482,19 +482,19 @@ Author: paboyle Simd U_11; \ Simd U_21; -#define ZERO_RESULT \ - result_00=S(0.0,0.0); \ - result_01=S(0.0,0.0); \ - result_02=S(0.0,0.0); \ - result_10=S(0.0,0.0); \ - result_11=S(0.0,0.0); \ - result_12=S(0.0,0.0); \ - result_20=S(0.0,0.0); \ - result_21=S(0.0,0.0); \ - result_22=S(0.0,0.0); \ - result_30=S(0.0,0.0); \ - result_31=S(0.0,0.0); \ - result_32=S(0.0,0.0); +#define ZERO_RESULT \ + zeroit(result_00); \ + zeroit(result_01); \ + zeroit(result_02); \ + zeroit(result_10); \ + zeroit(result_11); \ + zeroit(result_12); \ + zeroit(result_20); \ + zeroit(result_21); \ + zeroit(result_22); \ + 
zeroit(result_30); \ + zeroit(result_31); \ + zeroit(result_32); #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 2c1a38e7..b2c7588f 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -67,6 +67,7 @@ public: accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;}; accelerator_inline Real real(void) const { return z.x; }; accelerator_inline Real imag(void) const { return z.y; }; + accelerator_inline GpuComplex &operator=(const Zero &zz) { z.x = 0; z.y=0; return *this; }; accelerator_inline GpuComplex &operator*=(const GpuComplex &r) { *this = (*this) * r; return *this; diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index be7c89c0..2ce48369 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -7,20 +7,20 @@ template class SimpleCompressor { public: void Point(int) {}; - accelerator_inline int CommDatumSize(void) { return sizeof(vobj); } - accelerator_inline bool DecompressionStep(void) { return false; } - template accelerator_inline void Compress(cobj *buf,int o,const cobj &in) { buf[o]=in; } - accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){ + accelerator_inline int CommDatumSize(void) const { return sizeof(vobj); } + accelerator_inline bool DecompressionStep(void) const { return false; } + template accelerator_inline void Compress(cobj *buf,int o,const cobj &in) const { buf[o]=in; } + accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o) const { exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); } - accelerator_inline void Decompress(vobj *out,vobj *in, int o){ assert(0); } + accelerator_inline void Decompress(vobj *out,vobj *in, int o) const { assert(0); } accelerator_inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, - int j,int k, int m,int type){ + int j,int k, int m,int type) const { 
exchange(out0[j],out1[j],in[k],in[m],type); } // For cshift. Cshift should drop compressor coupling altogether // because I had to decouple the code from the Stencil anyway - accelerator_inline vobj operator() (const vobj &arg) { + accelerator_inline vobj operator() (const vobj &arg) const { return arg; } }; diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 23fc8203..58cebed3 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -147,16 +147,16 @@ class CartesianStencilAccelerator { cobj* u_recv_buf_p; cobj* u_send_buf_p; - accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; } + accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; } - accelerator_inline int GetNodeLocal(int osite,int point) { + accelerator_inline int GetNodeLocal(int osite,int point) const { return this->_entries_p[point+this->_npoints*osite]._is_local; } - accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { + accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const { ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; } - accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = this->_entries_p[ent]._is_local; perm = this->_entries_p[ent]._permute; @@ -168,14 +168,14 @@ class CartesianStencilAccelerator { } } - accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) { + accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) const { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; int local = this->_entries_p[ent]._is_local; if (local) return base + this->_entries_p[ent]._byte_offset; else return cbase + this->_entries_p[ent]._byte_offset; } - accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) + accelerator_inline void 
iCoorFromIindex(Coordinate &coor,int lane) const { Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); } @@ -221,7 +221,7 @@ public: typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; - typedef CartesianStencilView View_type; + typedef const CartesianStencilView View_type; typedef typename View_type::StencilVector StencilVector; /////////////////////////////////////////// // Helper structs diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index ede24fbe..672f385f 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -66,7 +66,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ #ifndef GRID_SYCL -// Use the scalar as our own complex on GPU +// Use the scalar as our own complex on GPU ... thrust::complex or std::complex template = 0> accelerator_inline typename vsimd::scalar_type coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) @@ -96,6 +96,8 @@ void coalescedWrite(vsimd & __restrict__ vec, p[lane]=extracted; } #else +// For SyCL have option to use GpuComplex from inside the vector type in SIMT loops +// Faster for some reason template = 0> accelerator_inline typename vsimd::vector_type::datum coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 2b7bf53a..f1a694fb 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -456,7 +456,7 @@ accelerator_inline void acceleratorSynchronise(void) __syncwarp(); #endif #ifdef GRID_SYCL - // No barrier call on SYCL?? 
// Option get __spir:: stuff to do warp barrier + cl::sycl::detail::workGroupBarrier(); #endif #ifdef GRID_HIP __syncthreads(); diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index cb86177e..03f3ee61 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -53,7 +53,7 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); Coordinate latt4 = GridDefaultLatt(); - int Ls=8; + int Ls=16; for(int i=0;i> Ls; diff --git a/configure.ac b/configure.ac index fb0c78fc..5f165412 100644 --- a/configure.ac +++ b/configure.ac @@ -140,12 +140,23 @@ AC_ARG_ENABLE([gparity], [ac_GPARITY=${enable_gparity}], [ac_GPARITY=yes]) AM_CONDITIONAL(BUILD_GPARITY, [ test "${ac_GPARITY}X" == "yesX" ]) + +AC_ARG_ENABLE([zmobius], + [AC_HELP_STRING([--enable-zmobius=yes|no], [enable Zmobius support])], + [ac_ZMOBIUS=${enable_zmobius}], [ac_ZMOBIUS=yes]) + +AM_CONDITIONAL(BUILD_ZMOBIUS, [ test "${ac_ZMOBIUS}X" == "yesX" ]) + + case ${ac_FERMION_REPS} in yes) AC_DEFINE([ENABLE_FERMION_REPS],[1],[non QCD fermion reps]);; esac case ${ac_GPARITY} in yes) AC_DEFINE([ENABLE_GPARITY],[1],[fermion actions with GPARITY BCs]);; esac +case ${ac_ZMOBIUS} in + yes) AC_DEFINE([ENABLE_ZMOBIUS],[1],[Zmobius fermion actions]);; +esac ############### Nc AC_ARG_ENABLE([Nc], [AC_HELP_STRING([--enable-Nc=2|3|4], [enable number of colours])], From 1eea9d73b984252c69869f65991f2f33d6e93193 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 3 Mar 2021 23:50:01 +0100 Subject: [PATCH 141/201] Pass serial RNG around --- Grid/qcd/action/ActionBase.h | 2 +- Grid/qcd/action/gauge/GaugeImplTypes.h | 2 +- Grid/qcd/action/gauge/PlaqPlusRectangleAction.h | 2 +- Grid/qcd/action/gauge/WilsonGaugeAction.h | 3 +-- .../qcd/action/pseudofermion/ExactOneFlavourRatio.h | 2 +- .../pseudofermion/OneFlavourEvenOddRational.h | 11 ++++++----- .../pseudofermion/OneFlavourEvenOddRationalRatio.h | 7 +++++-- Grid/qcd/action/pseudofermion/OneFlavourRational.h | 
7 +++++-- .../action/pseudofermion/OneFlavourRationalRatio.h | 7 +++++-- Grid/qcd/action/pseudofermion/TwoFlavour.h | 2 +- Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h | 2 +- .../action/pseudofermion/TwoFlavourEvenOddRatio.h | 2 +- Grid/qcd/action/pseudofermion/TwoFlavourRatio.h | 2 +- Grid/qcd/action/scalar/ScalarAction.h | 2 +- Grid/qcd/action/scalar/ScalarImpl.h | 4 ++-- Grid/qcd/action/scalar/ScalarInteractionAction.h | 2 +- Grid/qcd/hmc/HMC.h | 2 +- Grid/qcd/hmc/integrators/Integrator.h | 13 ++++++------- benchmarks/Benchmark_dwf_fp32.cc | 2 +- 19 files changed, 42 insertions(+), 34 deletions(-) diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index bff21d1d..17980ee0 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -41,7 +41,7 @@ class Action public: bool is_smeared = false; // Heatbath? - virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual std::string action_name() = 0; // return the action name diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index 55a20eca..2499e0e9 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -96,7 +96,7 @@ public: /////////////////////////////////////////////////////////// // Move these to another class // HMC auxiliary functions - static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) + static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG) { // Zbigniew Srocinsky thesis: // diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index 
639aca19..7690092d 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -49,7 +49,7 @@ public: virtual std::string action_name(){return "PlaqPlusRectangleAction";} - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {}; // noop as no pseudoferms + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms virtual std::string LogParameters(){ std::stringstream sstream; diff --git a/Grid/qcd/action/gauge/WilsonGaugeAction.h b/Grid/qcd/action/gauge/WilsonGaugeAction.h index 40d600d2..f535b54f 100644 --- a/Grid/qcd/action/gauge/WilsonGaugeAction.h +++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h @@ -54,8 +54,7 @@ public: return sstream.str(); } - virtual void refresh(const GaugeField &U, - GridParallelRNG &pRNG){}; // noop as no pseudoferms + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){}; // noop as no pseudoferms virtual RealD S(const GaugeField &U) { RealD plaq = WilsonLoops::avgPlaquette(U); diff --git a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h index 9fc0a3b0..576a8cf6 100644 --- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h +++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h @@ -124,7 +124,7 @@ NAMESPACE_BEGIN(Grid); // // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta // - virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { Lop.ImportGauge(U); Rop.ImportGauge(U); diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h index 56dff94d..656e9b2f 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h @@ 
-1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -43,8 +42,7 @@ NAMESPACE_BEGIN(Grid); // template -class OneFlavourEvenOddRationalPseudoFermionAction - : public Action { +class OneFlavourEvenOddRationalPseudoFermionAction : public Action { public: INHERIT_IMPL_TYPES(Impl); @@ -103,7 +101,7 @@ public: return sstream.str(); } - virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { // P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi} // = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi} // Phi = MpcdagMpc^{1/4} eta @@ -156,7 +154,10 @@ public: msCG(Mpc, PhiOdd, Y); - if ( (rand()%param.BoundsCheckFreq)==0 ) { + auto grid = FermOp.FermionGrid(); + auto r=rand(); + grid->Broadcast(0,r); + if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(FermOp.FermionRedBlackGrid()); gauss = PhiOdd; HighBoundCheck(Mpc,gauss,param.hi); diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h index e5f0b602..e968b8e4 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h @@ -101,7 +101,7 @@ NAMESPACE_BEGIN(Grid); } - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi // @@ -170,7 +170,10 @@ NAMESPACE_BEGIN(Grid); msCG_M(MdagM,X,Y); // Randomly apply rational bounds checks. 
- if ( (rand()%param.BoundsCheckFreq)==0 ) { + auto grid = NumOp.FermionGrid(); + auto r=rand(); + grid->Broadcast(0,r); + if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(NumOp.FermionRedBlackGrid()); gauss = PhiOdd; HighBoundCheck(MdagM,gauss,param.hi); diff --git a/Grid/qcd/action/pseudofermion/OneFlavourRational.h b/Grid/qcd/action/pseudofermion/OneFlavourRational.h index f6c823c9..aa647445 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourRational.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourRational.h @@ -98,7 +98,7 @@ NAMESPACE_BEGIN(Grid); - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(phi) = e^{- phi^dag (MdagM)^-1/2 phi} @@ -142,7 +142,10 @@ NAMESPACE_BEGIN(Grid); msCG(MdagMOp,Phi,Y); - if ( (rand()%param.BoundsCheckFreq)==0 ) { + auto grid = FermOp.FermionGrid(); + auto r=rand(); + grid->Broadcast(0,r); + if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(FermOp.FermionGrid()); gauss = Phi; HighBoundCheck(MdagMOp,gauss,param.hi); diff --git a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h index 5fae2fe9..128c869a 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h @@ -95,7 +95,7 @@ NAMESPACE_BEGIN(Grid); } - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi // @@ -156,7 +156,10 @@ NAMESPACE_BEGIN(Grid); msCG_M(MdagM,X,Y); // Randomly apply rational bounds checks. 
- if ( (rand()%param.BoundsCheckFreq)==0 ) { + auto grid = NumOp.FermionGrid(); + auto r=rand(); + grid->Broadcast(0,r); + if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(NumOp.FermionGrid()); gauss = Phi; HighBoundCheck(MdagM,gauss,param.hi); diff --git a/Grid/qcd/action/pseudofermion/TwoFlavour.h b/Grid/qcd/action/pseudofermion/TwoFlavour.h index f905a675..2ac97ddd 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavour.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavour.h @@ -73,7 +73,7 @@ public: ////////////////////////////////////////////////////////////////////////////////////// // Push the gauge field in to the dops. Assume any BC's and smearing already applied ////////////////////////////////////////////////////////////////////////////////////// - virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { // P(phi) = e^{- phi^dag (MdagM)^-1 phi} // Phi = Mdag eta // P(eta) = e^{- eta^dag eta} diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h index a3cf8f08..2e5208a8 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h @@ -77,7 +77,7 @@ public: ////////////////////////////////////////////////////////////////////////////////////// // Push the gauge field in to the dops. 
Assume any BC's and smearing already applied ////////////////////////////////////////////////////////////////////////////////////// - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi} // Phi = McpDag eta diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h index d1d6f336..da628c75 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h @@ -84,7 +84,7 @@ NAMESPACE_BEGIN(Grid); } - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi} // diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h index 4d72faba..f584706d 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h @@ -64,7 +64,7 @@ public: return sstream.str(); } - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi} // diff --git a/Grid/qcd/action/scalar/ScalarAction.h b/Grid/qcd/action/scalar/ScalarAction.h index 34fc4fac..8b4f4f79 100644 --- a/Grid/qcd/action/scalar/ScalarAction.h +++ b/Grid/qcd/action/scalar/ScalarAction.h @@ -55,7 +55,7 @@ public: } virtual std::string action_name() {return "ScalarAction";} - virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // noop as no pseudoferms + virtual void refresh(const Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {} // noop as no pseudoferms virtual RealD S(const Field &p) { return (mass_square * 0.5 + Nd) * 
ScalarObs::sumphisquared(p) + diff --git a/Grid/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h index 403ea573..13bd6c90 100644 --- a/Grid/qcd/action/scalar/ScalarImpl.h +++ b/Grid/qcd/action/scalar/ScalarImpl.h @@ -27,7 +27,7 @@ public: typedef Field FermionField; typedef Field PropagatorField; - static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){ + static inline void generate_momenta(Field& P, GridSerialRNG &sRNG, GridParallelRNG& pRNG){ RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling gaussian(pRNG, P); P *= scale; @@ -151,7 +151,7 @@ public: out = one / out; } - static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) + static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG) { RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling #ifndef USE_FFT_ACCELERATION diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h index 5a5f9251..e04dd486 100644 --- a/Grid/qcd/action/scalar/ScalarInteractionAction.h +++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h @@ -77,7 +77,7 @@ public: virtual std::string action_name() { return "ScalarAction"; } - virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} + virtual void refresh(const Field &U, GridSerialRNG & sRNG, GridParallelRNG &pRNG) {} virtual RealD S(const Field &p) { diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h index f168b69a..44674ea5 100644 --- a/Grid/qcd/hmc/HMC.h +++ b/Grid/qcd/hmc/HMC.h @@ -139,7 +139,7 @@ private: // Evolution ///////////////////////////////////////////////////////// RealD evolve_hmc_step(Field &U) { - TheIntegrator.refresh(U, pRNG); // set U and initialize P and phi's + TheIntegrator.refresh(U, sRNG, pRNG); // set U and initialize P and phi's RealD H0 = TheIntegrator.S(U); // initial state action diff --git a/Grid/qcd/hmc/integrators/Integrator.h 
b/Grid/qcd/hmc/integrators/Integrator.h index 77b7de52..aa28c6c8 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -236,10 +236,9 @@ public: // over the representations struct _refresh { template - void operator()(std::vector*> repr_set, Repr& Rep, - GridParallelRNG& pRNG) { + void operator()(std::vector*> repr_set, Repr& Rep, GridSerialRNG & sRNG, GridParallelRNG& pRNG) { for (int a = 0; a < repr_set.size(); ++a){ - repr_set.at(a)->refresh(Rep.U, pRNG); + repr_set.at(a)->refresh(Rep.U, sRNG, pRNG); std::cout << GridLogDebug << "Hirep refreshing pseudofermions" << std::endl; } @@ -247,12 +246,12 @@ public: } refresh_hireps{}; // Initialization of momenta and actions - void refresh(Field& U, GridParallelRNG& pRNG) + void refresh(Field& U, GridSerialRNG & sRNG, GridParallelRNG& pRNG) { assert(P.Grid() == U.Grid()); std::cout << GridLogIntegrator << "Integrator refresh\n"; - FieldImplementation::generate_momenta(P, pRNG); + FieldImplementation::generate_momenta(P, sRNG, pRNG); // Update the smeared fields, can be implemented as observer // necessary to keep the fields updated even after a reject @@ -269,11 +268,11 @@ public: // get gauge field from the SmearingPolicy and // based on the boolean is_smeared in actionID Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); - as[level].actions.at(actionID)->refresh(Us, pRNG); + as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); } // Refresh the higher representation actions - as[level].apply(refresh_hireps, Representations, pRNG); + as[level].apply(refresh_hireps, Representations, sRNG, pRNG); } MomFilter->applyFilter(P); diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc index cb86177e..03f3ee61 100644 --- a/benchmarks/Benchmark_dwf_fp32.cc +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -53,7 +53,7 @@ int main (int argc, char ** argv) int threads = GridThread::GetThreads(); Coordinate latt4 = GridDefaultLatt(); - int Ls=8; + 
int Ls=16; for(int i=0;i> Ls; From d4b4de8f428c625757c31c6d265c288671a0ef5b Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Thu, 4 Mar 2021 20:01:24 +0000 Subject: [PATCH 142/201] changes --- Grid/qcd/utils/BaryonUtils.h | 60 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index b74a5b20..1a7a4d38 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -252,7 +252,7 @@ public: const Gamma GammaB_sigma, const Gamma GammaB_nucl, const std::string op, - SpinMatrixField &stn_corr); + SpinMatrixField &xts_corr); }; //This computes a baryon contraction on a lattice site, including the spin-trace of the correlation matrix template @@ -1435,6 +1435,7 @@ void BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, int c_x = (ie_x < 3 ? (ie_x+2)%3 : (7-ie_x)%3 ); //epsilon[ie_x][2]; //c' int eSgn_x = (ie_x < 3 ? 1 : -1); ee = Real(eSgn_s * eSgn_x); + auto ee_GD = ee * trGDq; for (int alpha_x=0; alpha_x::XiToSigmaQ2EyeSite(const mobj &Dq_loop, // GammaB * DsGDd * GammaB auto GDsGDqGDdG = GDsGDqGDd * GammaB_sigma; + Real ee; + for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a - int b_s = epsilon[ie_s][1]; //b - int c_s = epsilon[ie_s][2]; //c + int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' + int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' + int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_s = (ie_s < 3 ? 1 : -1); for (int ie_x=0; ie_x < 6 ; ie_x++){ - int a_x = epsilon[ie_x][0]; //a' - int b_x = epsilon[ie_x][1]; //b' - int c_x = epsilon[ie_x][2]; //c' - auto ee = epsilon_sgn[ie_s] * epsilon_sgn[ie_x]; + int a_x = (ie_x < 3 ? ie_x : (6-ie_x)%3 ); //epsilon[ie_x][0]; //a' + int b_x = (ie_x < 3 ? (ie_x+1)%3 : (8-ie_x)%3 ); //epsilon[ie_x][1]; //b' + int c_x = (ie_x < 3 ? 
(ie_x+2)%3 : (7-ie_x)%3 ); //epsilon[ie_x][2]; //c' + int eSgn_x = (ie_x < 3 ? 1 : -1); + ee = Real(eSgn_s * eSgn_x); for (int alpha_x=0; alpha_x::XiToSigmaEye(const PropagatorField &qq_loop, autoView( vd_tf , qd_tf , AcceleratorRead); autoView( vs_ti , qs_ti , AcceleratorRead); - bool doQ1 = (op == "Q1"); - bool doQ2 = (op == "Q2"); - Vector my_Dq_spec{Dd_spec,Ds_spec}; mobj * Dq_spec_p = &my_Dq_spec[0]; - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_loop = vq_loop(ss); - auto Dd_tf = vd_tf(ss); - auto Ds_ti = vs_ti(ss); - typedef decltype(coalescedRead(vcorr[0])) spinor; - spinor result=Zero(); - if(doQ1){ + if(op == "Q1"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); XiToSigmaQ1EyeSite(Dq_loop,Dq_spec_p[0],Dq_spec_p[1],Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); - } else if(doQ2){ - XiToSigmaQ2EyeSite(Dq_loop,Dq_spec_p[0],Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); - } else { - assert(0 && "Weak Operator not correctly specified"); - } - coalescedWrite(vcorr[ss],result); - } );//end loop over lattice sites + coalescedWrite(vcorr[ss],result); + } );//end loop over lattice sites + } else if(op == "Q2"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + XiToSigmaQ2EyeSite(Dq_loop,Dq_spec_p[0],Dq_spec_p[1],Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); + coalescedWrite(vcorr[ss],result); + } );//end loop over lattice sites + } else { + assert(0 && "Weak Operator not correctly specified"); + } } From 9b15704290048b936e71c3ddd652a6eec9bf5fff Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Fri, 5 Mar 2021 10:42:32 +0000 Subject: [PATCH 143/201] tested and consitent --- 
Grid/qcd/utils/BaryonUtils.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index 1a7a4d38..b69865e8 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -27,7 +27,6 @@ *************************************************************************************/ /* END LEGAL */ #pragma once -//#include #include NAMESPACE_BEGIN(Grid); @@ -200,7 +199,7 @@ public: const Gamma GammaB_sigma, const Gamma GammaB_nucl, robj &result); - template + template accelerator_inline static void XiToSigmaQ1EyeSite(const mobj &Dq_loop, const mobj2 &Dd_spec, const mobj2 &Ds_spec, @@ -210,7 +209,7 @@ public: const Gamma GammaB_sigma, const Gamma GammaB_nucl, robj &result); - template + template accelerator_inline static void XiToSigmaQ2EyeSite(const mobj &Dq_loop, const mobj2 &Dd_spec, const mobj2 &Ds_spec, @@ -1395,7 +1394,7 @@ void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template +template accelerator_inline void BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, const mobj2 &Dd_spec, const mobj2 &Ds_spec, @@ -1464,7 +1463,7 @@ void BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template +template accelerator_inline void BaryonUtils::XiToSigmaQ2EyeSite(const mobj &Dq_loop, const mobj2 &Dd_spec, const mobj2 &Ds_spec, From 7a19432e0b77fff8943ad6701f448ab366dab27f Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Fri, 5 Mar 2021 10:57:09 +0000 Subject: [PATCH 144/201] whitespace --- Grid/qcd/utils/BaryonUtils.h | 84 ++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index b69865e8..9d9cb508 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ 
b/Grid/qcd/utils/BaryonUtils.h @@ -191,34 +191,34 @@ public: robj &result); template accelerator_inline static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); template accelerator_inline static void XiToSigmaQ1EyeSite(const mobj &Dq_loop, - const mobj2 &Dd_spec, - const mobj2 &Ds_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); template accelerator_inline static void XiToSigmaQ2EyeSite(const mobj &Dq_loop, - const mobj2 &Dd_spec, - const mobj2 &Ds_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); public: template static void SigmaToNucleonEye(const PropagatorField &qq_loop, @@ -232,26 +232,26 @@ public: SpinMatrixField &stn_corr); template static void SigmaToNucleonNonEye(const PropagatorField &qq_ti, - const PropagatorField &qq_tf, - const mobj &Du_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &stn_corr); + const PropagatorField &qq_tf, + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField 
&qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr); template static void XiToSigmaEye(const PropagatorField &qq_loop, - const mobj &Dd_spec, - const mobj &Ds_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &xts_corr); + const mobj &Dd_spec, + const mobj &Ds_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &xts_corr); }; //This computes a baryon contraction on a lattice site, including the spin-trace of the correlation matrix template From aa173e29989b93445cc28655dd7d0feadb746961 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 5 Mar 2021 10:25:33 -0500 Subject: [PATCH 145/201] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 4cbae720..fff68dc6 100644 --- a/README.md +++ b/README.md @@ -149,7 +149,6 @@ If you want to build all the tests at once just use `make tests`. - `--enable-numa`: enable NUMA first touch optimisation - `--enable-simd=`: setup Grid for the SIMD target `` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-gen-simd-width=`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). -- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** - `--enable-comms=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--disable-timers`: disable system dependent high-resolution timers. 
From b24181aa4f21a59e601237947c5ea465d0e1ecaf Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Fri, 5 Mar 2021 16:56:58 +0100 Subject: [PATCH 146/201] Update Coordinate.h Revert GRID_MAX_SIMD change --- Grid/util/Coordinate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/util/Coordinate.h b/Grid/util/Coordinate.h index 89f73264..004fbc72 100644 --- a/Grid/util/Coordinate.h +++ b/Grid/util/Coordinate.h @@ -88,7 +88,7 @@ public: // Coordinate class, maxdims = 8 for now. //////////////////////////////////////////////////////////////// #define GRID_MAX_LATTICE_DIMENSION (8) -#define GRID_MAX_SIMD (sizeof(vInteger)/sizeof(Integer)) +#define GRID_MAX_SIMD (16) static constexpr int MaxDims = GRID_MAX_LATTICE_DIMENSION; From a76cb005e007bf887e0573175f9e9806fecc61b1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 8 Mar 2021 13:37:57 -0500 Subject: [PATCH 147/201] Update Tensor_exp.h --- Grid/tensors/Tensor_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/tensors/Tensor_exp.h b/Grid/tensors/Tensor_exp.h index 0a1d6389..1f637d5f 100644 --- a/Grid/tensors/Tensor_exp.h +++ b/Grid/tensors/Tensor_exp.h @@ -28,7 +28,7 @@ Author: neo #ifndef GRID_MATH_EXP_H #define GRID_MATH_EXP_H -#define DEFAULT_MAT_EXP 12 +#define DEFAULT_MAT_EXP 20 NAMESPACE_BEGIN(Grid); From 4d1ea15c79d30c961272d5020404395db44365dc Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 9 Mar 2021 04:29:37 +0100 Subject: [PATCH 148/201] More verbosity. 
The 16bit limit on Grid.y, Grid.z is annoying --- Grid/threads/Accelerator.cc | 3 +-- Grid/threads/Accelerator.h | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 4bf7f395..9d9d851c 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -53,7 +53,6 @@ void acceleratorInit(void) prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { -#ifndef GRID_DEFAULT_GPU if ( i==rank ) { printf("AcceleratorCudaInit[%d]: ========================\n",rank); printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i); @@ -67,8 +66,8 @@ void acceleratorInit(void) GPU_PROP(warpSize); GPU_PROP(pciBusID); GPU_PROP(pciDeviceID); + printf("AcceleratorCudaInit[%d]: maxGridSize (%d,%d,%d)\n",rank,prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2]); } -#endif // GPU_PROP(unifiedAddressing); // GPU_PROP(l2CacheSize); // GPU_PROP(singleToDoublePrecisionPerfRatio); diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index f1a694fb..56b85c72 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -178,9 +178,10 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, cudaDeviceSynchronize(); \ cudaError err = cudaGetLastError(); \ if ( cudaSuccess != err ) { \ - printf("Cuda error %s \n", cudaGetErrorString( err )); \ - puts(__FILE__); \ - printf("Line %d\n",__LINE__); \ + printf("accelerator_barrier(): Cuda error %s \n", \ + cudaGetErrorString( err )); \ + printf("File %s Line %d\n",__FILE__,__LINE__); \ + fflush(stdout); \ if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \ } \ } From 6a429ee6d3eee71689f650782decb8948bf6fa77 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 9 Mar 2021 04:31:10 +0100 Subject: [PATCH 149/201] 2d loop hits Nvidia 16bit limit on large local vols --- Grid/cshift/Cshift_common.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git 
a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h index f2f39815..cf902b58 100644 --- a/Grid/cshift/Cshift_common.h +++ b/Grid/cshift/Cshift_common.h @@ -110,9 +110,11 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ -#ifdef ACCELERATOR_CSHIFT +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); - accelerator_for2d(n,e1,b,e2,1,{ + accelerator_for(nn,e1*e2,1,{ + int n = nn%e1; + int b = nn/e1; int o = n*n1; int offset = b+n*e2; @@ -135,7 +137,9 @@ Gather_plane_extract(const Lattice &rhs, std::cout << " Dense packed buffer WARNING " < void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int _slice_block = rhs.Grid()->_slice_block[dimension]; #ifdef ACCELERATOR_CSHIFT autoView( rhs_v , rhs, AcceleratorWrite); - accelerator_for2d(n,e1,b,e2,1,{ + accelerator_for(nn,e1*e2,1,{ + int n = nn%e1; + int b = nn/e1; int o = n*_slice_stride; int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); @@ -274,7 +280,7 @@ template void Scatter_plane_merge(Lattice &rhs,ExtractPointerA // Case of SIMD split AND checker dim cannot currently be hit, except in // Test_cshift_red_black code. 
- // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME + std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"< Date: Wed, 10 Mar 2021 02:45:22 +0100 Subject: [PATCH 150/201] Clean up test --- tests/core/Test_where.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/core/Test_where.cc b/tests/core/Test_where.cc index 050b711b..deb29865 100644 --- a/tests/core/Test_where.cc +++ b/tests/core/Test_where.cc @@ -40,9 +40,9 @@ int main (int argc, char ** argv) int N=16; - std::vector latt_size ({N,4,4}); - std::vector simd_layout({vComplexD::Nsimd(),1,1}); - std::vector mpi_layout ({1,1,1}); + std::vector latt_size ({N,N,N,N}); + std::vector simd_layout({vComplexD::Nsimd(),1,1,1}); + std::vector mpi_layout ({1,1,1,1}); int vol = 1; int nd = latt_size.size(); @@ -69,7 +69,7 @@ int main (int argc, char ** argv) for(int t=0;t Date: Wed, 10 Mar 2021 05:40:51 -0800 Subject: [PATCH 151/201] Gives 200GF/s on SyCL/DG1 8^4, doesn't uglify develop for other platforms too badly. Easy to revert to clean more C++ stylistic code. Theres a SYCL_HACK macro I will clean up later once dpcpp evolves a central nervous systems. 
--- Grid/qcd/action/fermion/WilsonKernels.h | 10 +- .../WilsonKernelsHandImplementation.h | 99 ++++++++++++++++--- .../WilsonKernelsImplementation.h | 20 +++- 3 files changed, 115 insertions(+), 14 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonKernels.h b/Grid/qcd/action/fermion/WilsonKernels.h index 1bac9211..68422f28 100644 --- a/Grid/qcd/action/fermion/WilsonKernels.h +++ b/Grid/qcd/action/fermion/WilsonKernels.h @@ -49,9 +49,17 @@ public: INHERIT_IMPL_TYPES(Impl); typedef FermionOperator Base; - + typedef AcceleratorVector StencilVector; public: +#ifdef GRID_SYCL +#define SYCL_HACK +#endif +#ifdef SYCL_HACK + static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf, + int ss,int sU,const SiteSpinor *in, SiteSpinor *out); +#endif + static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index 688cb75a..fb42fe88 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -118,15 +118,6 @@ Author: paboyle #endif -#define LOAD_CHI \ - {const SiteHalfSpinor &ref(buf[offset]); \ - Chi_00 = coalescedRead(ref()(0)(0)); \ - Chi_01 = coalescedRead(ref()(0)(1)); \ - Chi_02 = coalescedRead(ref()(0)(2)); \ - Chi_10 = coalescedRead(ref()(1)(0)); \ - Chi_11 = coalescedRead(ref()(1)(1)); \ - Chi_12 = coalescedRead(ref()(1)(2));} - #define MULT_2SPIN(A)\ {auto & ref(U[sU](A)); \ U_00=coalescedRead(ref()(0,0)); \ @@ -157,6 +148,15 @@ Author: paboyle UChi_02+= U_20*Chi_02; \ UChi_12+= U_20*Chi_12;} +#define LOAD_CHI \ + {const SiteHalfSpinor &ref(buf[offset]); \ + Chi_00 = coalescedRead(ref()(0)(0)); \ + Chi_01 = 
coalescedRead(ref()(0)(1)); \ + Chi_02 = coalescedRead(ref()(0)(2)); \ + Chi_10 = coalescedRead(ref()(1)(0)); \ + Chi_11 = coalescedRead(ref()(1)(1)); \ + Chi_12 = coalescedRead(ref()(1)(2));} + // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); #define XP_PROJ \ @@ -370,7 +370,7 @@ Author: paboyle result_31-= UChi_11; \ result_32-= UChi_12; -#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ +#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \ local = SE->_is_local; \ @@ -384,6 +384,37 @@ Author: paboyle } else { \ LOAD_CHI; \ } \ + acceleratorSynchronise(); \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=&st_p[DIR+8*ss]; \ + ptype=st_perm[DIR]; \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU(PERM); \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + acceleratorSynchronise(); \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \ + SE=&st_p[DIR+8*ss]; \ + ptype=st_perm[DIR]; \ + /*SE=st.GetEntry(ptype,DIR,ss);*/ \ + offset = SE->_offset; \ + perm = SE->_permute; \ + LOAD_CHIMU(PERM); \ + PROJ; \ MULT_2SPIN(DIR); \ RECON; @@ -401,10 +432,12 @@ Author: paboyle } else if ( st.same_node[DIR] ) { \ LOAD_CHI; \ } \ + acceleratorSynchronise(); \ if (local || st.same_node[DIR] ) { \ MULT_2SPIN(DIR); \ RECON; \ - } + } \ + acceleratorSynchronise(); #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ @@ -414,7 +447,8 @@ Author: paboyle MULT_2SPIN(DIR); \ RECON; \ nmu++; \ - } + } \ + acceleratorSynchronise(); #define HAND_RESULT(ss) \ { \ @@ -511,10 +545,41 @@ Author: paboyle NAMESPACE_BEGIN(Grid); + +#ifdef SYCL_HACK +template accelerator_inline void +WilsonKernels::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf, + int ss,int sU,const 
SiteSpinor *in, SiteSpinor *out) +{ +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... + typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + typedef iSinglet vCplx; + // typedef decltype( coalescedRead( vCplx()()() )) Simt; + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + + HAND_DECLARATIONS(Simt); + + int offset,local,perm, ptype; + StencilEntry *SE; + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} +#endif + template accelerator_inline void WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -539,6 +604,8 @@ template accelerator_inline void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; @@ -562,6 +629,8 @@ template accelerator_inline void WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -586,6 +655,8 @@ template accelerator_inline void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; @@ -609,6 +680,8 @@ template accelerator_inline void WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; @@ -634,6 +707,8 @@ template accelerator_inline void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index 937d13af..9228b84c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -416,7 +416,21 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #undef LoopBody } -#define KERNEL_CALLNB(A) \ +#define KERNEL_CALL_TMP(A) \ + const uint64_t NN = Nsite*Ls; \ + auto U_p = & U_v[0]; \ + auto in_p = & in_v[0]; \ + auto out_p = & out_v[0]; \ + auto st_p = st_v._entries_p; \ + auto st_perm = 
st_v._permute_type; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + WilsonKernels::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \ + }); \ + accelerator_barrier(); + +#define KERNEL_CALLNB(A) \ const uint64_t NN = Nsite*Ls; \ accelerator_forNB( ss, NN, Simd::Nsimd(), { \ int sF = ss; \ @@ -445,7 +459,11 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} +#ifdef SYCL_HACK + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; } +#else if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} +#endif #ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif From f786ff8d69534e7885a55a7090a26b4d28619209 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 10 Mar 2021 14:32:06 -0500 Subject: [PATCH 152/201] Extend test from Fionn, fails on A100 apparently --- tests/core/Test_where_extended.cc | 137 ++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 tests/core/Test_where_extended.cc diff --git a/tests/core/Test_where_extended.cc b/tests/core/Test_where_extended.cc new file mode 100644 index 00000000..706fd7ee --- /dev/null +++ b/tests/core/Test_where_extended.cc @@ -0,0 +1,137 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_poisson_fft.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt_size ({N,4,4}); + std::vector simd_layout({vComplexD::Nsimd(),1,1}); + std::vector mpi_layout ({1,1,1}); + + int vol = 1; + int nd = latt_size.size(); + for(int d=0;d({45,12,81,9})); + gaussian(RNG,rn); + + RealD nn=norm2(rn); + for(int mu=0;mu seeds4({1,2,3,4}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); + + + std::cout<=tmin),tmpF,ZZF); + nA = nA + norm2(tmp2F); + InsertSlice(tmp2F, q_outF, s , 0); + } + + RealD nQO=norm2(q_outF); + std::cout <=tmin),tmpP,ZZP); + nA = nA + norm2(tmp2P); + InsertSlice(tmp2P, q_outP, s , 0); + } + + nQO=norm2(q_outP); + std::cout < Date: Thu, 11 Mar 2021 12:58:49 +0100 Subject: [PATCH 153/201] Fix inconsistent configure option AVX512 Before this change AVX512 enabled different instruction sets depending on the compiler: For Intel C++ Compiler Classic (ICC): AVX512F, AVX512CD, AVX512DQ, AVX512BW, AVX512VL i.e. Intel Xeon Skylake and newer For Intel ICX, gcc, clang: AVX512F, AVX512CD, AVX512ER, AVX512PF i.e. 
Intel Xeon Phi x200/x205 (KNL/KNM) With this commit AVX512 now only enables the common instruction sets supported by all CPUs supporting any AVX-512 instruction set: AVX512F and AVX512CD (called COMMON-AVX512 by icc) --- configure.ac | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 5f165412..afd5cbad 100644 --- a/configure.ac +++ b/configure.ac @@ -444,7 +444,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='-mavx2 -mfma -mf16c';; AVX512) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) - SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; + SIMD_FLAGS='-mavx512f -mavx512cd';; SKL) AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon]) SIMD_FLAGS='-march=skylake-avx512';; @@ -498,7 +498,7 @@ case ${ax_cv_cxx_compiler_vendor} in SIMD_FLAGS='-march=core-avx2 -xcore-avx2';; AVX512) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) - SIMD_FLAGS='-xcore-avx512';; + SIMD_FLAGS='-xcommon-avx512';; KNC) AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner]) SIMD_FLAGS='';; From 82402c6a7cc3057dd0e4c2f6389ba2b347972cfc Mon Sep 17 00:00:00 2001 From: Peter Georg Date: Thu, 11 Mar 2021 13:08:40 +0100 Subject: [PATCH 154/201] Add simd option SKL for ICC --- configure.ac | 3 +++ 1 file changed, 3 insertions(+) diff --git a/configure.ac b/configure.ac index afd5cbad..4e5e33c8 100644 --- a/configure.ac +++ b/configure.ac @@ -499,6 +499,9 @@ case ${ax_cv_cxx_compiler_vendor} in AVX512) AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) SIMD_FLAGS='-xcommon-avx512';; + SKL) + AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) + SIMD_FLAGS='-xcore-avx512';; KNC) AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner]) SIMD_FLAGS='';; From ce1fc1f48aac5a48512b6ff70c1d85d0810d7623 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Mar 2021 22:20:53 +0100 Subject: [PATCH 155/201] Possible fallback plan for Fionn's compiler bug in nvcc --- Grid/lattice/Lattice_where.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 
insertions(+), 17 deletions(-) diff --git a/Grid/lattice/Lattice_where.h b/Grid/lattice/Lattice_where.h index 6686d1b3..e9a3f823 100644 --- a/Grid/lattice/Lattice_where.h +++ b/Grid/lattice/Lattice_where.h @@ -43,7 +43,7 @@ inline void whereWolf(Lattice &ret,const Lattice &predicate,Lattice< conformable(iftrue,predicate); conformable(iftrue,ret); - GridBase *grid=iftrue._grid; + GridBase *grid=iftrue.Grid(); typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_type scalar_type; @@ -52,22 +52,24 @@ inline void whereWolf(Lattice &ret,const Lattice &predicate,Lattice< const int Nsimd = grid->Nsimd(); - std::vector mask(Nsimd); - std::vector truevals (Nsimd); - std::vector falsevals(Nsimd); + Integer mask; + scalar_object trueval; + scalar_object falseval; - parallel_for(int ss=0;ssoSites(); ss++){ - - extract(iftrue._odata[ss] ,truevals); - extract(iffalse._odata[ss] ,falsevals); - extract(TensorRemove(predicate._odata[ss]),mask); - - for(int s=0;soSites(); + thread_for(ss,NN,{ + for(int l=0;l @@ -76,9 +78,9 @@ inline Lattice whereWolf(const Lattice &predicate,Lattice &ift conformable(iftrue,iffalse); conformable(iftrue,predicate); - Lattice ret(iftrue._grid); + Lattice ret(iftrue.Grid()); - where(ret,predicate,iftrue,iffalse); + whereWolf(ret,predicate,iftrue,iffalse); return ret; } From cd5891eecd45b53db3a02498d97d0bd5e29c67eb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 11 Mar 2021 22:34:28 +0100 Subject: [PATCH 156/201] Test that fails on Cuda 11.0 --- tests/core/Test_where_extended.cc | 124 ++++++++++++++++-------------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/tests/core/Test_where_extended.cc b/tests/core/Test_where_extended.cc index 706fd7ee..9862b3ed 100644 --- a/tests/core/Test_where_extended.cc +++ b/tests/core/Test_where_extended.cc @@ -51,87 +51,93 @@ int main (int argc, char ** argv) } GridCartesian GRID(latt_size,simd_layout,mpi_layout); - - LatticeComplexD zz(&GRID); - LatticeInteger 
coor(&GRID); - LatticeComplexD rn(&GRID); - LatticeComplexD sl(&GRID); - - zz = ComplexD(0.0,0.0); - GridParallelRNG RNG(&GRID); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); - gaussian(RNG,rn); - RealD nn=norm2(rn); - for(int mu=0;mu seeds4({1,2,3,4}); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); - std::cout<=tmin),tmpF,ZZF); - nA = nA + norm2(tmp2F); - InsertSlice(tmp2F, q_outF, s , 0); + gaussian(RNG,rn); + + RealD nn=norm2(rn); + for(int mu=0;mu=tmin),tmpP,ZZP); - nA = nA + norm2(tmp2P); - InsertSlice(tmp2P, q_outP, s , 0); + zz = ComplexD(0.0,0.0); + + gaussian(RNG,rn); + + RealD nn=norm2(rn); + for(int mu=0;mu Date: Thu, 11 Mar 2021 23:54:53 +0100 Subject: [PATCH 157/201] NVCC versions found buggy added as guard --- Grid/util/CompilerCompatible.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Grid/util/CompilerCompatible.h b/Grid/util/CompilerCompatible.h index 37331668..7c4a056d 100644 --- a/Grid/util/CompilerCompatible.h +++ b/Grid/util/CompilerCompatible.h @@ -1,5 +1,16 @@ #pragma once +#if defined(__NVCC__) + +#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 0) +#error "NVCC version 11.0 breaks on Ampere, see Github issue 346" +#endif +#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 1) +#error "NVCC version 11.1 breaks on Ampere, see Github issue 346" +#endif + +#endif + #if defined(__clang__) #if __clang_major__ < 3 From db3ac67506bcbdbe0799632dcc1561a96d0e320f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 12 Mar 2021 14:55:07 +0100 Subject: [PATCH 158/201] Update thread issue --- Grid/lattice/Lattice_where.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Grid/lattice/Lattice_where.h b/Grid/lattice/Lattice_where.h index e9a3f823..777f4015 100644 --- a/Grid/lattice/Lattice_where.h +++ b/Grid/lattice/Lattice_where.h @@ -52,16 +52,15 @@ inline void whereWolf(Lattice &ret,const Lattice 
&predicate,Lattice< const int Nsimd = grid->Nsimd(); - Integer mask; - scalar_object trueval; - scalar_object falseval; - autoView(iftrue_v,iftrue,CpuRead); autoView(iffalse_v,iffalse,CpuRead); autoView(predicate_v,predicate,CpuRead); autoView(ret_v,ret,CpuWrite); Integer NN= grid->oSites(); thread_for(ss,NN,{ + Integer mask; + scalar_object trueval; + scalar_object falseval; for(int l=0;l Date: Fri, 12 Mar 2021 09:31:17 -0500 Subject: [PATCH 159/201] updated to do list. Start adding DDHMC work items --- TODO | 75 ++++++++++++++++++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 40 deletions(-) diff --git a/TODO b/TODO index f1175560..e23e040d 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,6 @@ +-- comms threads issue?? +-- Part done: Staggered kernel performance on GPU + ========================================================= General ========================================================= @@ -5,28 +8,18 @@ General - Make representations code take Gimpl - Simplify the HMCand remove modules - Lattice_arith - are the mult, mac etc.. still needed after ET engine? -- Lattice_rng -- Lattice_transfer.h -- accelerate A2Autils -- off critical path for HMC +- Lattice_rng - faster local only loop in init +- Audit: accelerate A2Autils -- off critical path for HMC ========================================================= -GPU branch code item work list +GPU work list ========================================================= -* sum_cpu promote to double during summation for increased precisoin. +* sum_cpu promote to double during summation for increased precision. * Introduce sumD & ReduceD * GPU sum is probably better currently. - * Accelerate the cshift & benchmark -* 0) Single GPU -- 128 bit integer table load in GPU code. - - ImprovedStaggered accelerate & measure perf - - Gianluca's changes to Cayley into gpu-port - - Mobius kernel fusion. -- Gianluca? - - Lebesque order reintroduction. 
StencilView should have pointer to it - - Lebesgue reorder in all kernels - * 3) Comms/NVlink - OpenMP tasks to run comms threads. Experiment with it - Remove explicit openMP in staggered. @@ -35,14 +28,6 @@ GPU branch code item work list - Stencil gather ?? - SIMD dirs in stencil -* 4) ET enhancements -- eval -> scalar ops in ET engine -- coalescedRead, coalescedWrite in expressions. - -* 5) Misc -- Conserved current clean up. -- multLinkProp eliminate - 8) Merge develop and test HMC 9) Gamma tables on GPU; check this. Appear to work, but no idea why. Are these done on CPU? @@ -52,7 +37,7 @@ GPU branch code item work list - Audit NAMESPACE CHANGES - Audit changes ------ +--------- Gianluca's changes - Performance impact of construct in aligned allocator??? --------- @@ -62,6 +47,33 @@ Gianluca's changes ----------------------------- DONE: ----------------------------- +===== +-- Done: Remez X^-1/2 X^-1/2 X = 1 test. + Feed in MdagM^2 as a test and take its sqrt. + Automated test that MdagM invsqrt(MdagM)invsqrt(MdagM) = 1 in HMC for bounds satisfaction. + +-- Done: Sycl Kernels into develop. Compare to existing unroll and just use. +-- Done: sRNG into refresh functions +-- Done: Tuned decomposition on CUDA into develop +-- Done: Sycl friend accessor. Const view attempt via typedef?? + + +* Done 5) Misc +- Conserved current clean up. +- multLinkProp eliminate + +* Done 0) Single GPU +- 128 bit integer table load in GPU code. + - ImprovedStaggered accelerate & measure perf + - Gianluca's changes to Cayley into gpu-port + - Mobius kernel fusion. -- Gianluca? + - Lebesque order reintroduction. StencilView should have pointer to it + - Lebesgue reorder in all kernels + +* 4) ET enhancements +- Done eval -> scalar ops in ET engine +- Done coalescedRead, coalescedWrite in expressions. 
+ ============================================================================================= AUDIT ContractWWVV with respect to develop -- DONE - GPU accelerate EOFA -- DONE @@ -125,23 +137,6 @@ AUDIT ContractWWVV with respect to develop -- DONE - - (4) omp parallel for collapse(n) - - Only (1) has a natural mirror in accelerator_loop - - Nested loop macros get cumbersome made a generic interface for N deep -- - Don't like thread_region and thread_loop_in_region -- - Could replace with - - thread_nested(1, - for { - - } - ); - thread_nested(2, - for (){ - for (){ - - } - } - ); - - and same "in_region". ----------------------------- From 51f506553c59923fc60f3a68333b14757b8853a5 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 12 Mar 2021 15:33:04 +0100 Subject: [PATCH 160/201] Read out the local ID once, and store --- .../WilsonKernelsHandImplementation.h | 165 ++++++++++-------- 1 file changed, 96 insertions(+), 69 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index fb42fe88..0703b613 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -79,18 +79,18 @@ Author: paboyle #ifdef GRID_SIMT #define LOAD_CHIMU(ptype) \ {const SiteSpinor & ref (in[offset]); \ - Chimu_00=coalescedReadPermute(ref()(0)(0),perm); \ - Chimu_01=coalescedReadPermute(ref()(0)(1),perm); \ - Chimu_02=coalescedReadPermute(ref()(0)(2),perm); \ - Chimu_10=coalescedReadPermute(ref()(1)(0),perm); \ - Chimu_11=coalescedReadPermute(ref()(1)(1),perm); \ - Chimu_12=coalescedReadPermute(ref()(1)(2),perm); \ - Chimu_20=coalescedReadPermute(ref()(2)(0),perm); \ - Chimu_21=coalescedReadPermute(ref()(2)(1),perm); \ - Chimu_22=coalescedReadPermute(ref()(2)(2),perm); \ - Chimu_30=coalescedReadPermute(ref()(3)(0),perm); \ - 
Chimu_31=coalescedReadPermute(ref()(3)(1),perm); \ - Chimu_32=coalescedReadPermute(ref()(3)(2),perm); } + Chimu_00=coalescedReadPermute(ref()(0)(0),perm,lane); \ + Chimu_01=coalescedReadPermute(ref()(0)(1),perm,lane); \ + Chimu_02=coalescedReadPermute(ref()(0)(2),perm,lane); \ + Chimu_10=coalescedReadPermute(ref()(1)(0),perm,lane); \ + Chimu_11=coalescedReadPermute(ref()(1)(1),perm,lane); \ + Chimu_12=coalescedReadPermute(ref()(1)(2),perm,lane); \ + Chimu_20=coalescedReadPermute(ref()(2)(0),perm,lane); \ + Chimu_21=coalescedReadPermute(ref()(2)(1),perm,lane); \ + Chimu_22=coalescedReadPermute(ref()(2)(2),perm,lane); \ + Chimu_30=coalescedReadPermute(ref()(3)(0),perm,lane); \ + Chimu_31=coalescedReadPermute(ref()(3)(1),perm,lane); \ + Chimu_32=coalescedReadPermute(ref()(3)(2),perm,lane); } #define PERMUTE_DIR(dir) ; #else #define LOAD_CHIMU(ptype) \ @@ -119,43 +119,43 @@ Author: paboyle #endif #define MULT_2SPIN(A)\ - {auto & ref(U[sU](A)); \ - U_00=coalescedRead(ref()(0,0)); \ - U_10=coalescedRead(ref()(1,0)); \ - U_20=coalescedRead(ref()(2,0)); \ - U_01=coalescedRead(ref()(0,1)); \ - U_11=coalescedRead(ref()(1,1)); \ - U_21=coalescedRead(ref()(2,1)); \ - UChi_00 = U_00*Chi_00; \ - UChi_10 = U_00*Chi_10; \ - UChi_01 = U_10*Chi_00; \ - UChi_11 = U_10*Chi_10; \ - UChi_02 = U_20*Chi_00; \ - UChi_12 = U_20*Chi_10; \ - UChi_00+= U_01*Chi_01; \ - UChi_10+= U_01*Chi_11; \ - UChi_01+= U_11*Chi_01; \ - UChi_11+= U_11*Chi_11; \ - UChi_02+= U_21*Chi_01; \ - UChi_12+= U_21*Chi_11; \ - U_00=coalescedRead(ref()(0,2)); \ - U_10=coalescedRead(ref()(1,2)); \ - U_20=coalescedRead(ref()(2,2)); \ - UChi_00+= U_00*Chi_02; \ - UChi_10+= U_00*Chi_12; \ - UChi_01+= U_10*Chi_02; \ - UChi_11+= U_10*Chi_12; \ - UChi_02+= U_20*Chi_02; \ + {auto & ref(U[sU](A)); \ + U_00=coalescedRead(ref()(0,0),lane); \ + U_10=coalescedRead(ref()(1,0),lane); \ + U_20=coalescedRead(ref()(2,0),lane); \ + U_01=coalescedRead(ref()(0,1),lane); \ + U_11=coalescedRead(ref()(1,1),lane); \ + 
U_21=coalescedRead(ref()(2,1),lane); \ + UChi_00 = U_00*Chi_00; \ + UChi_10 = U_00*Chi_10; \ + UChi_01 = U_10*Chi_00; \ + UChi_11 = U_10*Chi_10; \ + UChi_02 = U_20*Chi_00; \ + UChi_12 = U_20*Chi_10; \ + UChi_00+= U_01*Chi_01; \ + UChi_10+= U_01*Chi_11; \ + UChi_01+= U_11*Chi_01; \ + UChi_11+= U_11*Chi_11; \ + UChi_02+= U_21*Chi_01; \ + UChi_12+= U_21*Chi_11; \ + U_00=coalescedRead(ref()(0,2),lane); \ + U_10=coalescedRead(ref()(1,2),lane); \ + U_20=coalescedRead(ref()(2,2),lane); \ + UChi_00+= U_00*Chi_02; \ + UChi_10+= U_00*Chi_12; \ + UChi_01+= U_10*Chi_02; \ + UChi_11+= U_10*Chi_12; \ + UChi_02+= U_20*Chi_02; \ UChi_12+= U_20*Chi_12;} #define LOAD_CHI \ {const SiteHalfSpinor &ref(buf[offset]); \ - Chi_00 = coalescedRead(ref()(0)(0)); \ - Chi_01 = coalescedRead(ref()(0)(1)); \ - Chi_02 = coalescedRead(ref()(0)(2)); \ - Chi_10 = coalescedRead(ref()(1)(0)); \ - Chi_11 = coalescedRead(ref()(1)(1)); \ - Chi_12 = coalescedRead(ref()(1)(2));} + Chi_00 = coalescedRead(ref()(0)(0),lane); \ + Chi_01 = coalescedRead(ref()(0)(1),lane); \ + Chi_02 = coalescedRead(ref()(0)(2),lane); \ + Chi_10 = coalescedRead(ref()(1)(0),lane); \ + Chi_11 = coalescedRead(ref()(1)(1),lane); \ + Chi_12 = coalescedRead(ref()(1)(2),lane);} // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); @@ -453,35 +453,35 @@ Author: paboyle #define HAND_RESULT(ss) \ { \ SiteSpinor & ref (out[ss]); \ - coalescedWrite(ref()(0)(0),result_00); \ - coalescedWrite(ref()(0)(1),result_01); \ - coalescedWrite(ref()(0)(2),result_02); \ - coalescedWrite(ref()(1)(0),result_10); \ - coalescedWrite(ref()(1)(1),result_11); \ - coalescedWrite(ref()(1)(2),result_12); \ - coalescedWrite(ref()(2)(0),result_20); \ - coalescedWrite(ref()(2)(1),result_21); \ - coalescedWrite(ref()(2)(2),result_22); \ - coalescedWrite(ref()(3)(0),result_30); \ - coalescedWrite(ref()(3)(1),result_31); \ - coalescedWrite(ref()(3)(2),result_32); \ + coalescedWrite(ref()(0)(0),result_00,lane); \ + 
coalescedWrite(ref()(0)(1),result_01,lane); \ + coalescedWrite(ref()(0)(2),result_02,lane); \ + coalescedWrite(ref()(1)(0),result_10,lane); \ + coalescedWrite(ref()(1)(1),result_11,lane); \ + coalescedWrite(ref()(1)(2),result_12,lane); \ + coalescedWrite(ref()(2)(0),result_20,lane); \ + coalescedWrite(ref()(2)(1),result_21,lane); \ + coalescedWrite(ref()(2)(2),result_22,lane); \ + coalescedWrite(ref()(3)(0),result_30,lane); \ + coalescedWrite(ref()(3)(1),result_31,lane); \ + coalescedWrite(ref()(3)(2),result_32,lane); \ } #define HAND_RESULT_EXT(ss) \ { \ SiteSpinor & ref (out[ss]); \ - coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00); \ - coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01); \ - coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02); \ - coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10); \ - coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11); \ - coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12); \ - coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20); \ - coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21); \ - coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22); \ - coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30); \ - coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31); \ - coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32); \ + coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00,lane); \ + coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01,lane); \ + coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02,lane); \ + coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10,lane); \ + coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11,lane); \ + coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12,lane); \ + coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20,lane); \ + 
coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21,lane); \ + coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22,lane); \ + coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30,lane); \ + coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31,lane); \ + coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32,lane); \ } #define HAND_DECLARATIONS(Simd) \ @@ -558,6 +558,9 @@ WilsonKernels::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, // typedef decltype( coalescedRead( vCplx()()() )) Simt; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + HAND_DECLARATIONS(Simt); int offset,local,perm, ptype; @@ -584,6 +587,10 @@ WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + HAND_DECLARATIONS(Simt); int offset,local,perm, ptype; @@ -609,6 +616,10 @@ void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + HAND_DECLARATIONS(Simt); StencilEntry *SE; @@ -635,6 +646,10 @@ WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + HAND_DECLARATIONS(Simt); int offset,local,perm, ptype; @@ -660,6 +675,10 @@ void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi typedef typename 
Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + HAND_DECLARATIONS(Simt); StencilEntry *SE; @@ -686,6 +705,10 @@ WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + HAND_DECLARATIONS(Simt); int offset, ptype; @@ -712,6 +735,10 @@ void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldVi typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + HAND_DECLARATIONS(Simt); StencilEntry *SE; From 9c2b37218a8849ece7198fc7a25b56bde10f6b13 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 18 Mar 2021 06:24:11 -0400 Subject: [PATCH 161/201] sRNG parameter added --- Grid/qcd/utils/Metric.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Grid/qcd/utils/Metric.h b/Grid/qcd/utils/Metric.h index 10d06de8..d8ae27dc 100644 --- a/Grid/qcd/utils/Metric.h +++ b/Grid/qcd/utils/Metric.h @@ -93,13 +93,13 @@ public: GeneralisedMomenta(GridBase* grid, Metric& M): M(M), Mom(grid), AuxMom(grid), AuxField(grid){} // Correct - void MomentaDistribution(GridParallelRNG& pRNG){ + void MomentaDistribution(GridSerialRNG & sRNG, GridParallelRNG& pRNG){ // Generate a distribution for // P^dag G P // where G = M^-1 // Generate gaussian momenta - Implementation::generate_momenta(Mom, pRNG); + Implementation::generate_momenta(Mom, sRNG, pRNG); // Modify the distribution with the metric M.MSquareRoot(Mom); @@ -107,8 +107,8 @@ public: // Auxiliary momenta // do nothing if trivial, so hide in 
the metric MomentaField AuxMomTemp(Mom.Grid()); - Implementation::generate_momenta(AuxMom, pRNG); - Implementation::generate_momenta(AuxField, pRNG); + Implementation::generate_momenta(AuxMom, sRNG, pRNG); + Implementation::generate_momenta(AuxField, sRNG, pRNG); // Modify the distribution with the metric // Aux^dag M Aux M.MInvSquareRoot(AuxMom); // AuxMom = M^{-1/2} AuxMomTemp From 49b0af2c95bd078b6b0b40b8b28b2ea8309daf45 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 18 Mar 2021 09:10:02 -0400 Subject: [PATCH 162/201] Update of tests to compile with the sRNG addition. Audited the code conventions (again) with the CPS momentum denominator and added anti periodic in time to the Test_mobius_force.cc and tested the Test_dwf_gpforce. Promoted thesee to test full HMC hamiltonian, tr P^2/2 + phidag MdagM phi with the same pdot and Udot as audited in the Integrator.h etc... With full comments and sources for factors. --- tests/forces/Test_dwf_force_eofa.cc | 4 +- tests/forces/Test_dwf_gpforce.cc | 123 +++++++++++++++-------- tests/forces/Test_dwf_gpforce_eofa.cc | 3 +- tests/forces/Test_laplacian_force.cc | 3 +- tests/forces/Test_mobius_force.cc | 109 +++++++++++++++++--- tests/forces/Test_mobius_force_eofa.cc | 3 +- tests/forces/Test_mobius_gpforce_eofa.cc | 3 +- 7 files changed, 188 insertions(+), 60 deletions(-) diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc index 80d36934..525178d0 100644 --- a/tests/forces/Test_dwf_force_eofa.cc +++ b/tests/forces/Test_dwf_force_eofa.cc @@ -86,7 +86,9 @@ int main (int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true); - Meofa.refresh(U, RNG5); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + Meofa.refresh(U, sRNG, RNG5 ); + RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" diff --git a/tests/forces/Test_dwf_gpforce.cc 
b/tests/forces/Test_dwf_gpforce.cc index 28133cc6..1fa1c6e4 100644 --- a/tests/forces/Test_dwf_gpforce.cc +++ b/tests/forces/Test_dwf_gpforce.cc @@ -84,6 +84,13 @@ int main (int argc, char ** argv) GparityDomainWallFermionR::ImplParams params; params.twists = twists; + /* + params.boundary_phases[0] = 1.0; + params.boundary_phases[1] = 1.0; + params.boundary_phases[2] = 1.0; + params.boundary_phases[3] =- 1.0; + */ + GparityDomainWallFermionR Dw(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); Dw.M (phi,Mphi); @@ -96,6 +103,16 @@ int main (int argc, char ** argv) Dw.MDeriv(tmp , Mphi, phi,DaggerNo ); UdSdU=tmp; Dw.MDeriv(tmp , phi, Mphi,DaggerYes ); UdSdU=(UdSdU+tmp); + + // ***************************************************************************************** + // *** There is a funny negative sign in all derivatives. This is - UdSdU. *** + // *** *** + // *** Deriv in both Wilson gauge action and the TwoFlavour.h seems to miss a minus sign *** + // *** UdSdU is negated relative to what I think - call what is returned mUdSdU, *** + // *** and insert minus sign *** + // ***************************************************************************************** + + UdSdU = - UdSdU ; // Follow sign convention of actions in Grid. Seems crazy. 
FermionField Ftmp (FGrid); @@ -106,7 +123,7 @@ int main (int argc, char ** argv) RealD Hmom = 0.0; RealD Hmomprime = 0.0; LatticeColourMatrix mommu(UGrid); - LatticeColourMatrix forcemu(UGrid); + LatticeColourMatrix mUdSdUmu(UGrid); LatticeGaugeField mom(UGrid); LatticeGaugeField Uprime(UGrid); @@ -114,10 +131,20 @@ int main (int argc, char ** argv) SU::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg - Hmom -= real(sum(trace(mommu*mommu))); + // Momentum Hamiltonian is - trace(p^2)/HMC_MOM_DENOMINATOR + // + // Integrator.h: RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom // GaugeImplTypes.h: Hloc += trace(Pmu * Pmu); + // Sign comes from a sneaky multiply by "i" in GaussianFundemantalLie algebra + // P is i P^a_\mu T^a, not Pa Ta + // + // Integrator.h: H = Hmom + sum S(action) + Hmom -= real(sum(trace(mommu*mommu)))/ HMC_MOMENTUM_DENOMINATOR; PokeIndex(mom,mommu,mu); + // -- Drops factor of "i" in the U update: U' = e^{P dt} U [ _not_ e^{iPdt}U ]. P is anti hermitian already + // -- Udot = p U + // fourth order exponential approx autoView( mom_v, mom, CpuRead); autoView( U_v , U, CpuRead); @@ -134,8 +161,8 @@ int main (int argc, char ** argv) ; }); } - std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <(mom,mu); - std::cout << GridLogMessage<< " Mommu " << norm2(mommu)<(UdSdU,mu); - std::cout << GridLogMessage<< " dsdumu " << norm2(mommu)<(UdSdU,mu); - mommu=Ta(mommu)*2.0; + mommu=Ta(mommu); // projectForce , GaugeImplTypes.h PokeIndex(UdSdU,mommu,mu); } for(int mu=0;mu(mom,mu); - std::cout << GridLogMessage<< " Mommu " << norm2(mommu)<(UdSdU,mu); - std::cout << GridLogMessage<< " dsdumu " << norm2(mommu)<(UdSdU,mu); + mUdSdUmu= PeekIndex(UdSdU,mu); mommu = PeekIndex(mom,mu); - // Update PF action density - dS = dS+trace(mommu*forcemu)*dt; + // + // Derive HMC eom: + // + // Sdot = - 2 trace( p p^dot ) / D - trace( p [ mUdSdU - h.c. 
] ) = 0 + // + // + // Sdot = 0 = - 2 trace( p p^dot ) / D - 2 trace( p Ta( mUdSdU ) = 0 + // + // EOM: + // + // pdot = - D Ta( mUdSdU ) -- source of sign is the "funny sign" above + // + // dSqcd_dt = - 2.0*trace(mommu* Ta(mUdSdU) )*dt -- i.e. mUdSdU with adjoint term -> force has a 2x implicit + // + // dH_mom/dt = - 2 trace (p pdot)/Denom + // + // dH_tot / dt = 0 <= pdot = - Denom * mUdSdU + // + // dH_mom/dt = 2 trace (p mUdSdU ) + // + // True Momentum delta H has a dt^2 piece + // + // dSmom = [ trace mom*mom - trace ( (mom-Denom*f*dt)(mom-Denom*f*dt) ) ] / Denom + // = 2*trace(mom*f) dt - Denom*dt*dt * trace(f*f). + // = dSmom + dSmom2 + // - dSmom = dSmom - trace(mommu*forcemu) * dt; - dSmom2 = dSmom2 - trace(forcemu*forcemu) *(0.25* dt*dt); + dS = dS - 2.0*trace(mommu*mUdSdUmu)*dt; // U and Udagger derivs hence 2x. - // Update mom action density - mommu = mommu + forcemu*(dt*0.5); + dSmom = dSmom + 2.0*trace(mommu*mUdSdUmu) * dt; // this 2.0 coms from derivative of p^2 + + dSmom2 = dSmom2 - trace(mUdSdUmu*mUdSdUmu) * dt*dt* HMC_MOMENTUM_DENOMINATOR; // Remnant - Hmomprime -= real(sum(trace(mommu*mommu))); + // Update mom action density . 
Verbatim update_P in Integrator.h + mommu = mommu - mUdSdUmu * dt* HMC_MOMENTUM_DENOMINATOR;; + + Hmomprime -= real(sum(trace(mommu*mommu))) / HMC_MOMENTUM_DENOMINATOR; } @@ -199,20 +233,25 @@ int main (int argc, char ** argv) ComplexD dSm = sum(dSmom); ComplexD dSm2 = sum(dSmom2); + std::cout << GridLogMessage <<"dSm "<< dSm< CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true); - Meofa.refresh(U, RNG5); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + Meofa.refresh(U, sRNG, RNG5); RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" diff --git a/tests/forces/Test_laplacian_force.cc b/tests/forces/Test_laplacian_force.cc index 18508860..dbaf1cbd 100644 --- a/tests/forces/Test_laplacian_force.cc +++ b/tests/forces/Test_laplacian_force.cc @@ -46,6 +46,7 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers({4,5,6,7}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({15,91,21,3})); @@ -67,7 +68,7 @@ int main (int argc, char ** argv) LaplacianAdjointField Laplacian(&Grid, CG, LapPar, Kappa); GeneralisedMomenta LaplacianMomenta(&Grid, Laplacian); LaplacianMomenta.M.ImportGauge(U); - LaplacianMomenta.MomentaDistribution(pRNG);// fills the Momenta with the correct distr + LaplacianMomenta.MomentaDistribution(sRNG,pRNG);// fills the Momenta with the correct distr std::cout << std::setprecision(15); diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc index ba7bc363..d2326a81 100644 --- a/tests/forces/Test_mobius_force.cc +++ b/tests/forces/Test_mobius_force.cc @@ -69,7 +69,14 @@ int main (int argc, char ** argv) RealD M5=1.8; RealD b=0.5; RealD c=0.5; - MobiusFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + + WilsonImplParams p; + p.boundary_phases[0] = 1.0; + p.boundary_phases[1] = 1.0; + p.boundary_phases[2] = 1.0; + p.boundary_phases[3] =- 1.0; + + 
MobiusFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,p); Ddwf.M (phi,Mphi); ComplexD S = innerProduct(Mphi,Mphi); // pdag MdagM p @@ -82,24 +89,44 @@ int main (int argc, char ** argv) Ddwf.MDeriv(tmp , Mphi, phi,DaggerNo ); UdSdU=tmp; Ddwf.MDeriv(tmp , phi, Mphi,DaggerYes ); UdSdU=(UdSdU+tmp); + // ***************************************************************************************** + // *** There is a funny negative sign in all derivatives. This is - UdSdU. *** + // *** *** + // *** Deriv in both Wilson gauge action and the TwoFlavour.h seems to miss a minus sign *** + // *** UdSdU is negated relative to what I think - call what is returned mUdSdU, *** + // *** and insert minus sign *** + // ***************************************************************************************** + + UdSdU = - UdSdU ; // Follow sign convention of actions in Grid. Seems crazy. + LatticeFermion Ftmp (FGrid); //////////////////////////////////// // Modify the gauge field a little //////////////////////////////////// - RealD dt = 0.0001; + RealD dt = 0.001; + RealD Hmom = 0.0; + RealD Hmomprime = 0.0; LatticeColourMatrix mommu(UGrid); - LatticeColourMatrix forcemu(UGrid); + LatticeColourMatrix mUdSdUmu(UGrid); LatticeGaugeField mom(UGrid); LatticeGaugeField Uprime(UGrid); for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg - PokeIndex(mom,mommu,mu); + // Momentum Hamiltonian is - trace(p^2)/HMC_MOM_DENOMINATOR + // + // Integrator.h: RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom // GaugeImplTypes.h: Hloc += trace(Pmu * Pmu); + // Sign comes from a sneaky multiply by "i" in GaussianFundemantalLie algebra + // P is i P^a_\mu T^a, not Pa Ta + // + // Integrator.h: H = Hmom + sum S(action) + Hmom -= real(sum(trace(mommu*mommu)))/ HMC_MOMENTUM_DENOMINATOR; + // fourth order exponential approx autoView( U_v , U, CpuRead); autoView( mom_v, mom, 
CpuRead); @@ -115,6 +142,7 @@ int main (int argc, char ** argv) ; }); } + std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <(UdSdU,mu); - mommu=Ta(mommu)*2.0; + mommu=Ta(mommu); PokeIndex(UdSdU,mommu,mu); } for(int mu=0;mu(UdSdU,mu); + + mUdSdUmu= PeekIndex(UdSdU,mu); mommu = PeekIndex(mom,mu); - // Update PF action density - dS = dS+trace(mommu*forcemu)*dt; + // + // Derive HMC eom: + // + // Sdot = - 2 trace( p p^dot ) / D - trace( p [ mUdSdU - h.c. ] ) = 0 + // + // + // Sdot = 0 = - 2 trace( p p^dot ) / D - 2 trace( p Ta( mUdSdU ) = 0 + // + // EOM: + // + // pdot = - D Ta( mUdSdU ) -- source of sign is the "funny sign" above + // + // dSqcd_dt = - 2.0*trace(mommu* Ta(mUdSdU) )*dt -- i.e. mUdSdU with adjoint term -> force has a 2x implicit + // + // dH_mom/dt = - 2 trace (p pdot)/Denom + // + // dH_tot / dt = 0 <= pdot = - Denom * mUdSdU + // + // dH_mom/dt = 2 trace (p mUdSdU ) + // + // True Momentum delta H has a dt^2 piece + // + // dSmom = [ trace mom*mom - trace ( (mom-Denom*f*dt)(mom-Denom*f*dt) ) ] / Denom + // = 2*trace(mom*f) dt - Denom*dt*dt * trace(f*f). + // = dSmom + dSmom2 + // + + dS = dS - 2.0*trace(mommu*mUdSdUmu)*dt; // U and Udagger derivs hence 2x. 
+ + dSmom = dSmom + 2.0*trace(mommu*mUdSdUmu) * dt; // this 2.0 coms from derivative of p^2 + + dSmom2 = dSmom2 - trace(mUdSdUmu*mUdSdUmu) * dt*dt* HMC_MOMENTUM_DENOMINATOR; // Remnant + + mommu = mommu - mUdSdUmu * dt* HMC_MOMENTUM_DENOMINATOR;; + + Hmomprime -= real(sum(trace(mommu*mommu))) / HMC_MOMENTUM_DENOMINATOR; } ComplexD dSpred = sum(dS); + ComplexD dSm = sum(dSmom); + ComplexD dSm2 = sum(dSmom2); - std::cout << GridLogMessage << " -- S "< CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false); - Meofa.refresh(U, RNG5); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + Meofa.refresh(U, sRNG, RNG5 ); RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc index 9c80b2aa..7f114615 100644 --- a/tests/forces/Test_mobius_gpforce_eofa.cc +++ b/tests/forces/Test_mobius_gpforce_eofa.cc @@ -93,7 +93,8 @@ int main (int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false); - Meofa.refresh(U, RNG5); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + Meofa.refresh(U, sRNG, RNG5 ); RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" From 15c50a7442527a7962e53cdc4a6bb3369f41501c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 18 Mar 2021 15:40:42 -0400 Subject: [PATCH 163/201] Explicit instantiate the template function --- Grid/util/Init.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 55d8c5bf..bfbc464d 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -153,6 +153,9 @@ void GridCmdOptionIntVector(const std::string &str,VectorInt & vec) return; } +template void GridCmdOptionIntVector(const std::string &str,std::vector & vec); +template void GridCmdOptionIntVector(const std::string 
&str,Coordinate & vec); + void GridCmdOptionInt(std::string &str,int & val) { std::stringstream ss(str); From 8bdadbadaca9cffd8d7616cc9deed07187b262f1 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 18 Mar 2021 15:41:14 -0400 Subject: [PATCH 164/201] Cold start --- HMC/Mobius2p1fRHMC.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/HMC/Mobius2p1fRHMC.cc b/HMC/Mobius2p1fRHMC.cc index 82ca4d37..b958d548 100644 --- a/HMC/Mobius2p1fRHMC.cc +++ b/HMC/Mobius2p1fRHMC.cc @@ -56,12 +56,12 @@ int main(int argc, char **argv) { MD.trajL = 1.0; HMCparameters HMCparams; - HMCparams.StartTrajectory = 30; + HMCparams.StartTrajectory = 0; HMCparams.Trajectories = 200; HMCparams.NoMetropolisUntil= 0; // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; - // HMCparams.StartingType =std::string("ColdStart"); - HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); HMCparams.MD = MD; HMCWrapper TheHMC(HMCparams); From bb89a82a07be478d50060efd5c76cf71622af870 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 29 Mar 2021 20:01:15 +0200 Subject: [PATCH 165/201] Staggered coalseced read --- Grid/qcd/action/fermion/StaggeredImpl.h | 16 ++-- Grid/qcd/action/fermion/WilsonImpl.h | 35 ++++++++- .../StaggeredKernelsImplementation.h | 76 ++++++++++--------- 3 files changed, 82 insertions(+), 45 deletions(-) diff --git a/Grid/qcd/action/fermion/StaggeredImpl.h b/Grid/qcd/action/fermion/StaggeredImpl.h index 8adf45a4..f44d12f4 100644 --- a/Grid/qcd/action/fermion/StaggeredImpl.h +++ b/Grid/qcd/action/fermion/StaggeredImpl.h @@ -72,19 +72,23 @@ public: StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){}; - static accelerator_inline void multLink(SiteSpinor &phi, + template + static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U, - const SiteSpinor &chi, + const _Spinor &chi, int mu) { - mult(&phi(), 
&U(mu), &chi()); + auto UU = coalescedRead(U(mu)); + mult(&phi(), &UU, &chi()); } - static accelerator_inline void multLinkAdd(SiteSpinor &phi, + template + static accelerator_inline void multLinkAdd(_Spinor &phi, const SiteDoubledGaugeField &U, - const SiteSpinor &chi, + const _Spinor &chi, int mu) { - mac(&phi(), &U(mu), &chi()); + auto UU = coalescedRead(U(mu)); + mac(&phi(), &UU, &chi()); } template diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 94676b6b..2ff6feba 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -184,18 +184,22 @@ public: mat = TraceIndex(P); } - inline void extractLinkField(std::vector &mat, DoubledGaugeField &Uds){ + inline void extractLinkField(std::vector &mat, DoubledGaugeField &Uds) + { for (int mu = 0; mu < Nd; mu++) mat[mu] = PeekIndex(Uds, mu); } - - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ - + inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu) + { +#undef USE_OLD_INSERT_FORCE int Ls=Btilde.Grid()->_fdimensions[0]; + autoView( mat_v , mat, AcceleratorWrite); +#ifdef USE_OLD_INSERT_FORCE GaugeLinkField tmp(mat.Grid()); tmp = Zero(); { + const int Nsimd = SiteSpinor::Nsimd(); autoView( tmp_v , tmp, AcceleratorWrite); autoView( Btilde_v , Btilde, AcceleratorRead); autoView( Atilde_v , Atilde, AcceleratorRead); @@ -208,6 +212,29 @@ public: }); } PokeIndex(mat,tmp,mu); +#else + { + const int Nsimd = SiteSpinor::Nsimd(); + autoView( Btilde_v , Btilde, AcceleratorRead); + autoView( Atilde_v , Atilde, AcceleratorRead); + accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{ + int sU=sss; + typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType; + ColorMatrixType sum; + zeroit(sum); + for(int s=0;s_is_local ) { \ - if (SE->_permute) { \ - chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ - } else { \ - chi_p = &in[SE->_offset]; \ - } \ + int perm= 
SE->_permute; \ + chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\ } else { \ - chi_p = &buf[SE->_offset]; \ + chi = coalescedRead(buf[SE->_offset],lane); \ } \ - multLink(Uchi, U[sU], *chi_p, Dir); + acceleratorSynchronise(); \ + multLink(Uchi, U[sU], chi, Dir); #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ SE = st.GetEntry(ptype, Dir+skew, sF); \ if (SE->_is_local ) { \ - if (SE->_permute) { \ - chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ - } else { \ - chi_p = &in[SE->_offset]; \ - } \ + int perm= SE->_permute; \ + chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\ } else if ( st.same_node[Dir] ) { \ - chi_p = &buf[SE->_offset]; \ + chi = coalescedRead(buf[SE->_offset],lane); \ } \ if (SE->_is_local || st.same_node[Dir] ) { \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], chi, Dir); \ } #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ SE = st.GetEntry(ptype, Dir+skew, sF); \ if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ nmu++; \ - chi_p = &buf[SE->_offset]; \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + chi = coalescedRead(buf[SE->_offset],lane); \ + multLink(Uchi, U[sU], chi, Dir); \ } template @@ -84,12 +77,14 @@ void StaggeredKernels::DhopSiteGeneric(StencilView &st, SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dag) { - const SiteSpinor *chi_p; - SiteSpinor chi; - SiteSpinor Uchi; + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int skew; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGeneric(StencilView &st, if ( dag ) { Uchi = - Uchi; } - vstream(out[sF], Uchi); + coalescedWrite(out[sF], Uchi,lane); } }; @@ -130,13 +125,16 @@ template accelerator_inline void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, - const 
FermionFieldView &in, FermionFieldView &out,int dag) { - const SiteSpinor *chi_p; - SiteSpinor chi; - SiteSpinor Uchi; + const FermionFieldView &in, FermionFieldView &out,int dag) +{ + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int skew ; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGenericInt(StencilView &st, if ( dag ) { Uchi = - Uchi; } - vstream(out[sF], Uchi); + coalescedWrite(out[sF], Uchi,lane); } }; @@ -178,14 +176,17 @@ template accelerator_inline void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, - const FermionFieldView &in, FermionFieldView &out,int dag) { - const SiteSpinor *chi_p; - // SiteSpinor chi; - SiteSpinor Uchi; + const FermionFieldView &in, FermionFieldView &out,int dag) +{ + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int nmu=0; int skew ; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGenericExt(StencilView &st, GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); } - if ( nmu ) { - if ( dag ) { - out[sF] = out[sF] - Uchi; + if ( nmu ) { + auto _out = coalescedRead(out[sF],lane); + if ( dag ) { + coalescedWrite(out[sF], _out-Uchi,lane); } else { - out[sF] = out[sF] + Uchi; + coalescedWrite(out[sF], _out+Uchi,lane); } } } @@ -261,6 +263,8 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, GridBase *FGrid=in.Grid(); GridBase *UGrid=U.Grid(); typedef StaggeredKernels ThisKernel; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); autoView( UUU_v , UUU, AcceleratorRead); autoView( U_v , U, AcceleratorRead); autoView( in_v , in, AcceleratorRead); @@ 
-301,6 +305,8 @@ void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, GridBase *FGrid=in.Grid(); GridBase *UGrid=U.Grid(); typedef StaggeredKernels ThisKernel; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); autoView( UUU_v , U, AcceleratorRead); autoView( U_v , U, AcceleratorRead); autoView( in_v , in, AcceleratorRead); From e9479929570e3b6e000f60b6368d7981666fac7c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 29 Mar 2021 20:04:06 +0200 Subject: [PATCH 166/201] Improved force terms --- Grid/tensors/Tensor_outer.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/Grid/tensors/Tensor_outer.h b/Grid/tensors/Tensor_outer.h index 4902c22f..a32a2a91 100644 --- a/Grid/tensors/Tensor_outer.h +++ b/Grid/tensors/Tensor_outer.h @@ -34,6 +34,16 @@ NAMESPACE_BEGIN(Grid); // outerProduct Scalar x Scalar -> Scalar // Vector x Vector -> Matrix /////////////////////////////////////////////////////////////////////////////////////// +template = 0> +accelerator_inline CC outerProduct(const CC &l, const CC& r) +{ + return l*conj(r); +} +template = 0> +accelerator_inline RR outerProduct(const RR &l, const RR& r) +{ + return l*r; +} template accelerator_inline auto outerProduct (const iVector& lhs,const iVector& rhs) -> iMatrix @@ -57,17 +67,6 @@ auto outerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar = 0> -accelerator_inline CC outerProduct(const CC &l, const CC& r) -{ - return l*conj(r); -} -template = 0> -accelerator_inline RR outerProduct(const RR &l, const RR& r) -{ - return l*r; -} - NAMESPACE_END(Grid); #endif From a7fb25adf66996f5de6861228774c803ef87bd4a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 29 Mar 2021 21:44:14 +0200 Subject: [PATCH 167/201] Make Cshift fields static to avoid repeated reallocaate overhead --- Grid/cshift/Cshift_mpi.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/cshift/Cshift_mpi.h 
b/Grid/cshift/Cshift_mpi.h index 375d004e..7e93e260 100644 --- a/Grid/cshift/Cshift_mpi.h +++ b/Grid/cshift/Cshift_mpi.h @@ -122,8 +122,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - cshiftVector send_buf(buffer_size); - cshiftVector recv_buf(buffer_size); + static cshiftVector send_buf; send_buf.resize(buffer_size); + static cshiftVector recv_buf; recv_buf.resize(buffer_size); int cb= (cbmask==0x2)? Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -198,8 +198,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd); - std::vector > recv_buf_extract(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; @@ -294,8 +294,8 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - cshiftVector send_buf_v(buffer_size); - cshiftVector recv_buf_v(buffer_size); + static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); + static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); vobj *send_buf; vobj *recv_buf; { @@ -381,8 +381,8 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd); - std::vector > recv_buf_extract(Nsimd); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); scalar_object * recv_buf_extract_mpi; scalar_object * send_buf_extract_mpi; { From 
addeb621a7015cb690a08d4650e8f2b10568fbf7 Mon Sep 17 00:00:00 2001 From: Andrew Zhen Ning Yong Date: Tue, 6 Apr 2021 13:45:37 +0100 Subject: [PATCH 168/201] Implemented tadpole operator for Shamir action. --- .../implementation/CayleyFermion5DImplementation.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index f11e9c44..c3e0f821 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -880,11 +880,23 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } std::vector G_s(Ls,1.0); + Integer sign = 1; // sign flip for vector/tadpole if ( curr_type == Current::Axial ) { for(int s=0;s_b; + auto c=this->_c; + if ( b == 1 && c == 0 ) { + sign = -1; + } + else { + std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." 
<< std::endl; + assert(b==1 && c==0); + } + } for(int s=0;s::SeqConservedCurrent(PropagatorField &q_in, tmp = Cshift(tmp,mu,1); Impl::multLinkField(Utmp,this->Umu,tmp,mu); - tmp = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop + tmp = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop tmp = where((lcoor>=tmin),tmp,zz); // Mask the time L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated From 980e721f6e25864cadfea3611a9c1a052d8d05c4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 13 Apr 2021 09:33:01 -0400 Subject: [PATCH 169/201] Update MetaData.h --- Grid/parallelIO/MetaData.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h index d30ba523..af8b3f76 100644 --- a/Grid/parallelIO/MetaData.h +++ b/Grid/parallelIO/MetaData.h @@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header) std::time_t t = std::time(nullptr); std::tm tm_ = *std::localtime(&t); std::ostringstream oss; - // oss << std::put_time(&tm_, "%c %Z"); + oss << std::put_time(&tm_, "%c %Z"); header.creation_date = oss.str(); header.archive_date = header.creation_date; From 86e11743ca3bb52ebfbedce8584b95b7be6f7fe0 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 20 Apr 2021 10:19:11 -0400 Subject: [PATCH 170/201] set twists --- tests/forces/Test_gp_rect_force.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc index 98ebb2fa..e277ea6b 100644 --- a/tests/forces/Test_gp_rect_force.cc +++ b/tests/forces/Test_gp_rect_force.cc @@ -29,7 +29,6 @@ Author: paboyle using namespace std; using namespace Grid; - ; @@ -59,6 +58,10 @@ int main (int argc, char ** argv) double beta = 1.0; double c1 = 0.331; + const int nu = 1; + std::vector twists(Nd,0); + twists[nu] = 1; + ConjugateGimplD::setDirections(twists); ConjugatePlaqPlusRectangleActionR Action(beta,c1); //ConjugateWilsonGaugeActionR Action(beta); 
//WilsonGaugeActionR Action(beta); From dbe210dd53405303cf57374c9b658902cbf8072a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 25 Apr 2021 10:25:59 -0400 Subject: [PATCH 171/201] Open the ens_id --- Grid/parallelIO/NerscIO.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h index 3ebdf0cc..1ffda074 100644 --- a/Grid/parallelIO/NerscIO.h +++ b/Grid/parallelIO/NerscIO.h @@ -205,11 +205,22 @@ public: std::cout< + static inline void writeConfiguration(Lattice &Umu, + std::string file, + std::string ens_id = std::string("UKQCD"), + std::string ens_label = std::string("DWF")) + { + writeConfiguration(Umu,file,0,1,ens_id,ens_label); + } template static inline void writeConfiguration(Lattice &Umu, std::string file, int two_row, - int bits32) + int bits32, + std::string ens_id = std::string("UKQCD"), + std::string ens_label = std::string("DWF")) { typedef vLorentzColourMatrixD vobj; typedef typename vobj::scalar_object sobj; @@ -219,8 +230,8 @@ public: // Following should become arguments /////////////////////////////////////////// header.sequence_number = 1; - header.ensemble_id = "UKQCD"; - header.ensemble_label = "DWF"; + header.ensemble_id = ens_id; + header.ensemble_label = ens_label; typedef LorentzColourMatrixD fobj3D; typedef LorentzColour2x3D fobj2D; @@ -232,7 +243,7 @@ public: GaugeStats Stats; Stats(Umu,header); MachineCharacteristics(header); - uint64_t offset; + uint64_t offset; // Sod it -- always write 3x3 double header.floating_point = std::string("IEEE64BIG"); From 955a8113ded53fc55c204fc107832933ad120c2c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 25 Apr 2021 10:36:38 -0400 Subject: [PATCH 172/201] Expose label only to reduce number of parameters --- Grid/parallelIO/NerscIO.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h index 1ffda074..04aae5d8 100644 --- 
a/Grid/parallelIO/NerscIO.h +++ b/Grid/parallelIO/NerscIO.h @@ -209,7 +209,6 @@ public: template static inline void writeConfiguration(Lattice &Umu, std::string file, - std::string ens_id = std::string("UKQCD"), std::string ens_label = std::string("DWF")) { writeConfiguration(Umu,file,0,1,ens_id,ens_label); @@ -219,7 +218,6 @@ public: std::string file, int two_row, int bits32, - std::string ens_id = std::string("UKQCD"), std::string ens_label = std::string("DWF")) { typedef vLorentzColourMatrixD vobj; @@ -230,7 +228,7 @@ public: // Following should become arguments /////////////////////////////////////////// header.sequence_number = 1; - header.ensemble_id = ens_id; + header.ensemble_id = std::string("UKQCD"); header.ensemble_label = ens_label; typedef LorentzColourMatrixD fobj3D; From d45c868656b4e28452946f90a6e71cdf12c21cf2 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 25 Apr 2021 10:53:34 -0400 Subject: [PATCH 173/201] Change interface --- Grid/parallelIO/NerscIO.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/parallelIO/NerscIO.h b/Grid/parallelIO/NerscIO.h index 04aae5d8..99011e25 100644 --- a/Grid/parallelIO/NerscIO.h +++ b/Grid/parallelIO/NerscIO.h @@ -211,7 +211,7 @@ public: std::string file, std::string ens_label = std::string("DWF")) { - writeConfiguration(Umu,file,0,1,ens_id,ens_label); + writeConfiguration(Umu,file,0,1,ens_label); } template static inline void writeConfiguration(Lattice &Umu, From 8cd4263974060f9af3b002604d9036e2552cc307 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 25 Apr 2021 22:20:37 -0400 Subject: [PATCH 174/201] Tests compile --- tests/debug/Test_heatbath_dwf_eofa.cc | 6 ++++-- tests/debug/Test_heatbath_dwf_eofa_gparity.cc | 7 +++++-- tests/debug/Test_heatbath_mobius_eofa.cc | 6 ++++-- tests/debug/Test_heatbath_mobius_eofa_gparity.cc | 6 ++++-- tests/forces/Test_momentum_filter.cc | 4 +++- 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tests/debug/Test_heatbath_dwf_eofa.cc 
b/tests/debug/Test_heatbath_dwf_eofa.cc index 9d453a96..e1c18021 100644 --- a/tests/debug/Test_heatbath_dwf_eofa.cc +++ b/tests/debug/Test_heatbath_dwf_eofa.cc @@ -66,7 +66,9 @@ int main(int argc, char** argv) // Set up RNGs std::vector seeds4({1, 2, 3, 4}); std::vector seeds5({5, 6, 7, 8}); + GridSerialRNG sRNG; GridParallelRNG RNG5(FGrid); + sRNG.SeedFixedIntegers(seeds5); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); @@ -84,7 +86,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, false); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu,sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } @@ -94,7 +96,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, true); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu,sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } diff --git a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc index 22cc1e90..7eabfc65 100644 --- a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc @@ -74,6 +74,9 @@ int main(int argc, char** argv) RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridSerialRNG sRNG; + RNG4.SeedFixedIntegers(seeds4); + sRNG.SeedFixedIntegers(seeds5); // Random gauge field LatticeGaugeField Umu(UGrid); @@ -90,7 +93,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, false); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu,sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } @@ -100,7 +103,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, true); - Meofa.refresh(Umu, RNG5); + 
Meofa.refresh(Umu,sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } diff --git a/tests/debug/Test_heatbath_mobius_eofa.cc b/tests/debug/Test_heatbath_mobius_eofa.cc index 4cf4bf53..48806642 100644 --- a/tests/debug/Test_heatbath_mobius_eofa.cc +++ b/tests/debug/Test_heatbath_mobius_eofa.cc @@ -68,8 +68,10 @@ int main(int argc, char** argv) // Set up RNGs std::vector seeds4({1, 2, 3, 4}); std::vector seeds5({5, 6, 7, 8}); + GridSerialRNG sRNG; GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + sRNG.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); @@ -86,7 +88,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, false); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu, sRNG,RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } @@ -96,7 +98,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, true); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu, sRNG,RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } diff --git a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc index 2fcb4b9f..52447e5e 100644 --- a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc @@ -73,7 +73,9 @@ int main(int argc, char** argv) std::vector seeds4({1, 2, 3, 4}); std::vector seeds5({5, 6, 7, 8}); GridParallelRNG RNG5(FGrid); + GridSerialRNG sRNG; RNG5.SeedFixedIntegers(seeds5); + sRNG.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); @@ -91,7 +93,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, false); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu, sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } @@ -101,7 +103,7 @@ int main(int argc, char** argv) 
ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, true); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu, sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } diff --git a/tests/forces/Test_momentum_filter.cc b/tests/forces/Test_momentum_filter.cc index 856ea0f2..794b5fa0 100644 --- a/tests/forces/Test_momentum_filter.cc +++ b/tests/forces/Test_momentum_filter.cc @@ -61,7 +61,9 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); + GridSerialRNG sRNG; pRNG.SeedFixedIntegers(seeds); + sRNG.SeedFixedIntegers(seeds); typedef PeriodicGimplR Gimpl; typedef WilsonGaugeAction GaugeAction; @@ -115,7 +117,7 @@ int main (int argc, char ** argv) integrator.setMomentumFilter(filter); - integrator.refresh(U, pRNG); //doesn't actually change the gauge field + integrator.refresh(U, sRNG, pRNG); //doesn't actually change the gauge field //Check the momentum is zero on the boundary const auto &P = integrator.getMomentum(); From 009ccd581ede8faf0ba748fa49a1757419106e23 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Mon, 26 Apr 2021 10:36:33 +0100 Subject: [PATCH 175/201] bugfix 3D stout smearing --- Grid/qcd/smearing/StoutSmearing.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/smearing/StoutSmearing.h b/Grid/qcd/smearing/StoutSmearing.h index ed2ccdb6..629f81e2 100644 --- a/Grid/qcd/smearing/StoutSmearing.h +++ b/Grid/qcd/smearing/StoutSmearing.h @@ -89,11 +89,12 @@ public: SmearBase->smear(C, U); for (int mu = 0; mu < Nd; mu++) { - if( mu == OrthogDim ) + Umu = peekLorentz(U, mu); + if( mu == OrthogDim ){ tmp = 1.0; // Don't smear in the orthogonal direction + } else { tmp = peekLorentz(C, mu); - Umu = peekLorentz(U, mu); iq_mu = Ta( tmp * adj(Umu)); // iq_mu = Ta(Omega_mu) to match the signs with the paper From cf2923d5ddb9c190ebb78efadab281e5a06ba247 Mon Sep 17 00:00:00 2001 From: Felix Erben Date: Tue, 27 Apr 2021 16:53:37 +0100 Subject: [PATCH 
176/201] Jamie's fix --- Grid/qcd/smearing/StoutSmearing.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/Grid/qcd/smearing/StoutSmearing.h b/Grid/qcd/smearing/StoutSmearing.h index 629f81e2..6ee78e8c 100644 --- a/Grid/qcd/smearing/StoutSmearing.h +++ b/Grid/qcd/smearing/StoutSmearing.h @@ -85,22 +85,18 @@ public: std::cout << GridLogDebug << "Stout smearing started\n"; - // Smear the configurations + // C contains the staples multiplied by some rho + u_smr = U ; // set the smeared field to the current gauge field SmearBase->smear(C, U); for (int mu = 0; mu < Nd; mu++) { + if( mu == OrthogDim ) continue ; + // u_smr = exp(iQ_mu)*U_mu apart from Orthogdim Umu = peekLorentz(U, mu); - if( mu == OrthogDim ){ - tmp = 1.0; // Don't smear in the orthogonal direction - } - else { - tmp = peekLorentz(C, mu); - iq_mu = Ta( - tmp * - adj(Umu)); // iq_mu = Ta(Omega_mu) to match the signs with the paper - exponentiate_iQ(tmp, iq_mu); - } - pokeLorentz(u_smr, tmp * Umu, mu); // u_smr = exp(iQ_mu)*U_mu + tmp = peekLorentz(C, mu); + iq_mu = Ta( tmp * adj(Umu)); + exponentiate_iQ(tmp, iq_mu); + pokeLorentz(u_smr, tmp * Umu, mu); } std::cout << GridLogDebug << "Stout smearing completed\n"; }; From 834f536b5f426aa0b3a334a89b37d8da39fb4238 Mon Sep 17 00:00:00 2001 From: u61464 Date: Tue, 4 May 2021 08:40:18 -0700 Subject: [PATCH 177/201] Fastest option on SyCL is now std::complex --- Grid/tensors/Tensor_SIMT.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index 672f385f..0a7d3382 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -65,7 +65,8 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ #else -#ifndef GRID_SYCL +//#ifndef GRID_SYCL +#if 1 // Use the scalar as our own complex on GPU ... 
thrust::complex or std::complex template = 0> accelerator_inline typename vsimd::scalar_type From 8cfc7342cde4b93d1de8f41a6f909ddd6d5d351f Mon Sep 17 00:00:00 2001 From: u61464 Date: Wed, 5 May 2021 14:17:18 -0700 Subject: [PATCH 178/201] staggered hand unroll read coalesce --- Grid/qcd/action/fermion/Fermion.h | 6 - Grid/qcd/action/fermion/FermionOperatorImpl.h | 5 +- .../implementation/StaggeredKernelsAsm.h | 5 +- .../implementation/StaggeredKernelsHand.h | 203 +++++++++--------- 4 files changed, 106 insertions(+), 113 deletions(-) diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index 16252340..09777204 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DD; -#ifndef GRID_CUDA -typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dR; -typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dF; -typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dD; -#endif - NAMESPACE_END(Grid); //////////////////// diff --git a/Grid/qcd/action/fermion/FermionOperatorImpl.h b/Grid/qcd/action/fermion/FermionOperatorImpl.h index 9345c0e6..56aaca12 100644 --- a/Grid/qcd/action/fermion/FermionOperatorImpl.h +++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h @@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered); ///////////////////////////////////////////////////////////////////////////// // Single flavour one component spinors with colour index. 
5d vec ///////////////////////////////////////////////////////////////////////////// -#include -NAMESPACE_CHECK(ImplStaggered5dVec); +// Deprecate Vec5d +//#include +//NAMESPACE_CHECK(ImplStaggered5dVec); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h index 63fd2a2f..e9cacbcf 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h @@ -680,7 +680,8 @@ void StaggeredKernels::DhopSiteAsm(StencilView &st, gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge3 =(uint64_t)&UU[sU]( T ); - +#undef STAG_VEC5D +#ifdef STAG_VEC5D // This is the single precision 5th direction vectorised kernel #include template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, @@ -790,7 +791,7 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilView #endif } - +#endif #define PERMUTE_DIR3 __asm__ ( \ diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h index 6bcb22b4..2b6087bc 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h @@ -32,25 +32,50 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -#define LOAD_CHI(b) \ +#ifdef GRID_SIMT + +#define LOAD_CHI(ptype,b) \ + const SiteSpinor & ref (b[offset]); \ + Chi_0=coalescedReadPermute(ref()()(0),perm,lane); \ + Chi_1=coalescedReadPermute(ref()()(1),perm,lane); \ + Chi_2=coalescedReadPermute(ref()()(2),perm,lane); + +#define LOAD_CHI_COMMS(b) \ const SiteSpinor & ref (b[offset]); \ - Chi_0=ref()()(0);\ - Chi_1=ref()()(1);\ - Chi_2=ref()()(2); + Chi_0=coalescedRead(ref()()(0),lane); \ + Chi_1=coalescedRead(ref()()(1),lane); \ + Chi_2=coalescedRead(ref()()(2),lane); + +#define PERMUTE_DIR(dir) ; +#else +#define LOAD_CHI(ptype,b) LOAD_CHI_COMMS(b) + +#define LOAD_CHI_COMMS(b) \ + const SiteSpinor & ref (b[offset]); \ 
+ Chi_0=ref()()(0); \ + Chi_1=ref()()(1); \ + Chi_2=ref()()(2); + +#define PERMUTE_DIR(dir) \ + permute##dir(Chi_0,Chi_0); \ + permute##dir(Chi_1,Chi_1); \ + permute##dir(Chi_2,Chi_2); + +#endif // To splat or not to splat depends on the implementation #define MULT(A,UChi) \ auto & ref(U[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - Impl::loadLinkElement(U_02,ref()(0,2)); \ - Impl::loadLinkElement(U_12,ref()(1,2)); \ - Impl::loadLinkElement(U_22,ref()(2,2)); \ + U_00=coalescedRead(ref()(0,0),lane); \ + U_10=coalescedRead(ref()(1,0),lane); \ + U_20=coalescedRead(ref()(2,0),lane); \ + U_01=coalescedRead(ref()(0,1),lane); \ + U_11=coalescedRead(ref()(1,1),lane); \ + U_21=coalescedRead(ref()(2,1),lane); \ + U_02=coalescedRead(ref()(0,2),lane); \ + U_12=coalescedRead(ref()(1,2),lane); \ + U_22=coalescedRead(ref()(2,2),lane); \ UChi ## _0 = U_00*Chi_0; \ UChi ## _1 = U_10*Chi_0;\ UChi ## _2 = U_20*Chi_0;\ @@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid); #define MULT_ADD(U,A,UChi) \ auto & ref(U[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - Impl::loadLinkElement(U_02,ref()(0,2)); \ - Impl::loadLinkElement(U_12,ref()(1,2)); \ - Impl::loadLinkElement(U_22,ref()(2,2)); \ + U_00=coalescedRead(ref()(0,0),lane); \ + U_10=coalescedRead(ref()(1,0),lane); \ + U_20=coalescedRead(ref()(2,0),lane); \ + U_01=coalescedRead(ref()(0,1),lane); \ + U_11=coalescedRead(ref()(1,1),lane); \ + U_21=coalescedRead(ref()(2,1),lane); \ + U_02=coalescedRead(ref()(0,2),lane); \ + U_12=coalescedRead(ref()(1,2),lane); \ + 
U_22=coalescedRead(ref()(2,2),lane); \ UChi ## _0 += U_00*Chi_0; \ UChi ## _1 += U_10*Chi_0;\ UChi ## _2 += U_20*Chi_0;\ @@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid); UChi ## _2 += U_22*Chi_2; -#define PERMUTE_DIR(dir) \ - permute##dir(Chi_0,Chi_0); \ - permute##dir(Chi_1,Chi_1); \ - permute##dir(Chi_2,Chi_2); - - #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ SE=st.GetEntry(ptype,Dir+skew,sF); \ offset = SE->_offset; \ local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHI(in); \ + LOAD_CHI(Perm,in); \ if ( perm) { \ PERMUTE_DIR(Perm); \ } \ } else { \ - LOAD_CHI(buf); \ + LOAD_CHI_COMMS(buf); \ } #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \ @@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid); } - #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \ SE=st.GetEntry(ptype,Dir+skew,sF); \ offset = SE->_offset; \ local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHI(in); \ + LOAD_CHI(Perm,in); \ if ( perm) { \ PERMUTE_DIR(Perm); \ } \ } else if ( st.same_node[Dir] ) { \ - LOAD_CHI(buf); \ + LOAD_CHI_COMMS(buf); \ } \ if (local || st.same_node[Dir] ) { \ MULT_ADD(U,Dir,even); \ @@ -140,10 +158,32 @@ NAMESPACE_BEGIN(Grid); local = SE->_is_local; \ if ((!local) && (!st.same_node[Dir]) ) { \ nmu++; \ - { LOAD_CHI(buf); } \ + { LOAD_CHI_COMMS(buf); } \ { MULT_ADD(U,Dir,even); } \ } +#define HAND_DECLARATIONS(Simd) \ + Simd even_0; \ + Simd even_1; \ + Simd even_2; \ + Simd odd_0; \ + Simd odd_1; \ + Simd odd_2; \ + \ + Simd Chi_0; \ + Simd Chi_1; \ + Simd Chi_2; \ + \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21; \ + Simd U_02; \ + Simd U_12; \ + Simd U_22; + template template accelerator_inline @@ -155,28 +195,14 @@ void StaggeredKernels::DhopSiteHand(StencilView &st, typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - Simd even_0; // 12 regs on knc - Simd even_1; - Simd even_2; - Simd odd_0; // 12 regs on knc - Simd odd_1; - Simd odd_2; - Simd Chi_0; // two 
spinor; 6 regs - Simd Chi_1; - Simd Chi_2; - - Simd U_00; // two rows of U matrix - Simd U_10; - Simd U_20; - Simd U_01; - Simd U_11; - Simd U_21; // 2 reg left. - Simd U_02; - Simd U_12; - Simd U_22; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + typedef decltype( coalescedRead( in[0]()()(0) )) Simt; + HAND_DECLARATIONS(Simt); - SiteSpinor result; + typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; + calcSiteSpinor result; int offset,local,perm, ptype; StencilEntry *SE; @@ -215,7 +241,7 @@ void StaggeredKernels::DhopSiteHand(StencilView &st, result()()(1) = even_1 + odd_1; result()()(2) = even_2 + odd_2; } - vstream(out[sF],result); + coalescedWrite(out[sF],result); } } @@ -230,28 +256,13 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - Simd even_0; // 12 regs on knc - Simd even_1; - Simd even_2; - Simd odd_0; // 12 regs on knc - Simd odd_1; - Simd odd_2; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + typedef decltype( coalescedRead( in[0]()()(0) )) Simt; + HAND_DECLARATIONS(Simt); - Simd Chi_0; // two spinor; 6 regs - Simd Chi_1; - Simd Chi_2; - - Simd U_00; // two rows of U matrix - Simd U_10; - Simd U_20; - Simd U_01; - Simd U_11; - Simd U_21; // 2 reg left. 
- Simd U_02; - Simd U_12; - Simd U_22; - - SiteSpinor result; + typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; + calcSiteSpinor result; int offset, ptype, local, perm; StencilEntry *SE; @@ -261,8 +272,8 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, // int sF=s+LLs*sU; { - even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); - odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); + zeroit(even_0); zeroit(even_1); zeroit(even_2); + zeroit(odd_0); zeroit(odd_1); zeroit(odd_2); skew = 0; HAND_STENCIL_LEG_INT(U,Xp,3,skew,even); @@ -294,7 +305,7 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, result()()(1) = even_1 + odd_1; result()()(2) = even_2 + odd_2; } - vstream(out[sF],result); + coalescedWrite(out[sF],result); } } @@ -309,28 +320,13 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - Simd even_0; // 12 regs on knc - Simd even_1; - Simd even_2; - Simd odd_0; // 12 regs on knc - Simd odd_1; - Simd odd_2; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + typedef decltype( coalescedRead( in[0]()()(0) )) Simt; + HAND_DECLARATIONS(Simt); - Simd Chi_0; // two spinor; 6 regs - Simd Chi_1; - Simd Chi_2; - - Simd U_00; // two rows of U matrix - Simd U_10; - Simd U_20; - Simd U_01; - Simd U_11; - Simd U_21; // 2 reg left. 
- Simd U_02; - Simd U_12; - Simd U_22; - - SiteSpinor result; + typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; + calcSiteSpinor result; int offset, ptype, local; StencilEntry *SE; @@ -340,8 +336,8 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, // int sF=s+LLs*sU; { - even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); - odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); + zeroit(even_0); zeroit(even_1); zeroit(even_2); + zeroit(odd_0); zeroit(odd_1); zeroit(odd_2); int nmu=0; skew = 0; HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even); @@ -374,7 +370,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, result()()(1) = even_1 + odd_1; result()()(2) = even_2 + odd_2; } - out[sF] = out[sF] + result; + coalescedWrite(out[sF] , out(sF)+ result); } } } @@ -397,6 +393,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, const FermionFieldView &in, FermionFieldView &out, int dag); \ */ #undef LOAD_CHI +#undef HAND_DECLARATIONS NAMESPACE_END(Grid); From 0e27e3847d6252c7a950e59517a8af0bf1e15549 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Thu, 3 Jun 2021 04:24:19 +0000 Subject: [PATCH 179/201] Remove synch --- Grid/threads/Accelerator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 56b85c72..b76d6d1c 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -457,7 +457,7 @@ accelerator_inline void acceleratorSynchronise(void) __syncwarp(); #endif #ifdef GRID_SYCL - cl::sycl::detail::workGroupBarrier(); + //cl::sycl::detail::workGroupBarrier(); #endif #ifdef GRID_HIP __syncthreads(); From ca10bfa1c7f6615bcc13322ee5e050c6e4b37ad8 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Fri, 4 Jun 2021 11:12:22 +0100 Subject: [PATCH 180/201] removing Travis CI constantly failing due to overtime (no way we can compile Grid on free time anymore) --- .travis.yml | 56 ----------------------------------------------------- 1 file changed, 56 deletions(-) delete 
mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 3a0e1e35..00000000 --- a/.travis.yml +++ /dev/null @@ -1,56 +0,0 @@ -language: cpp - -cache: - directories: - - clang - -matrix: - include: - - os: osx - osx_image: xcode8.3 - compiler: clang - -before_install: - - export GRIDDIR=`pwd` - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi - -install: - - export CWD=`pwd` - - echo $CWD - - export CC=$CC$VERSION - - export CXX=$CXX$VERSION - - echo $PATH - - which autoconf - - autoconf --version - - which automake - - automake --version - - which $CC - - $CC --version - - which $CXX - - $CXX --version - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi - -script: - - ./bootstrap.sh - - mkdir build - - cd build - - mkdir lime - - cd lime - - mkdir build - - cd build - - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz - - tar xf lime-1.3.2.tar.gz - - cd lime-1.3.2 - - ./configure --prefix=$CWD/build/lime/install - - make -j4 - - make install - - cd $CWD/build - - ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} - - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - - make check From 92def28bd3331153da2b8a2414f471e4f7831a4c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: 
Sun, 6 Jun 2021 04:52:05 -0400 Subject: [PATCH 181/201] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fff68dc6..88b922a5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) +# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) **Data parallel C++ mathematical object library.** From 4c5440fb0678b3a936ebe95f2c891d90b62feaaf Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 15 Jun 2021 21:45:07 +0000 Subject: [PATCH 182/201] const happy for sycl --- Grid/tensors/Tensor_extract_merge.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h index f1ded209..ea619d0f 100644 --- a/Grid/tensors/Tensor_extract_merge.h +++ b/Grid/tensors/Tensor_extract_merge.h @@ -1,5 +1,5 @@ /************************************************************************************* - +n Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/tensors/Tensor_extract_merge.h @@ -153,7 +153,7 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob // Extract to a bunch of scalar object pointers of different scalar type, with offset. 
Useful for precision change //////////////////////////////////////////////////////////////////////// template accelerator -void extract(const vobj &vec,ExtractPointerArray &extracted, int offset) +void extract(const vobj &vec,const ExtractPointerArray &extracted, int offset) { typedef typename GridTypeMapper::scalar_type sobj_scalar_type; typedef typename GridTypeMapper::scalar_type scalar_type; @@ -181,7 +181,7 @@ void extract(const vobj &vec,ExtractPointerArray &extracted, int offset) // Merge bunch of scalar object pointers of different scalar type, with offset. Useful for precision change //////////////////////////////////////////////////////////////////////// template accelerator -void merge(vobj &vec,ExtractPointerArray &extracted, int offset) +void merge(vobj &vec,const ExtractPointerArray &extracted, int offset) { typedef typename GridTypeMapper::scalar_type sobj_scalar_type; typedef typename GridTypeMapper::scalar_type scalar_type; From 6cd9224dd78aca959d3997479287cf943832d79c Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 16 Jun 2021 17:10:55 +0000 Subject: [PATCH 183/201] SYCL comms buffer allocate --- Grid/communicator/SharedMemoryMPI.cc | 54 +++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 466f6a1e..786122fa 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -35,6 +35,9 @@ Author: Christoph Lehner #endif #ifdef GRID_HIP #include +#endif +#ifdef GRID_SYCl + #endif NAMESPACE_BEGIN(Grid); @@ -446,7 +449,46 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended //////////////////////////////////////////////////////////////////////////////////////////// -#if defined(GRID_CUDA) ||defined(GRID_HIP) +#if defined(GRID_CUDA) ||defined(GRID_HIP) || 
defined(GRID_SYCL) + +#if defined(GRID_SYCL) +void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) +{ + void * ShmCommBuf ; + assert(_ShmSetup==1); + assert(_ShmAlloc==0); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // allocate the pointer array for shared windows for our group + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + MPI_Barrier(WorldShmComm); + WorldShmCommBufs.resize(WorldShmSize); + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Each MPI rank should allocate our own buffer + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + ShmCommBuf = acceleratorAllocDevice(bytes); + + if (ShmCommBuf == (void *)NULL ) { + std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; + exit(EXIT_FAILURE); + } + // if ( WorldRank == 0 ){ + if ( 1 ){ + std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes + << "bytes at "<< std::hex<< ShmCommBuf < Date: Tue, 22 Jun 2021 17:56:10 +0000 Subject: [PATCH 184/201] Force reqd subgroup size fo SYCL --- Grid/threads/Accelerator.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index b76d6d1c..c0af1019 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -257,11 +257,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { unsigned long nt=acceleratorThreads(); \ unsigned long unum1 = num1; \ unsigned long unum2 = num2; \ + if(nt < 8)nt=8; \ cl::sycl::range<3> local {nt,1,nsimd}; \ cl::sycl::range<3> global{unum1,unum2,nsimd}; \ cgh.parallel_for( \ cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) /*mutable*/ { \ + [=] (cl::sycl::nd_item<3> 
item) /*mutable*/ \ + [[intel::reqd_sub_group_size(8)]] \ + { \ auto iter1 = item.get_global_id(0); \ auto iter2 = item.get_global_id(1); \ auto lane = item.get_global_id(2); \ From 29a22ae603a3cf18d2ebeba2eb5aabcf27fe3e5d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 22 Jun 2021 17:57:20 +0000 Subject: [PATCH 185/201] Simpler SYCL setup --- Grid/communicator/SharedMemoryMPI.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index 786122fa..caa03a60 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -473,13 +473,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; exit(EXIT_FAILURE); } - // if ( WorldRank == 0 ){ - if ( 1 ){ - std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes - << "bytes at "<< std::hex<< ShmCommBuf < Date: Thu, 5 Aug 2021 18:33:20 -0400 Subject: [PATCH 186/201] Check is wrong (HtoD / DtoH) --- Grid/communicator/SharedMemoryMPI.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index caa03a60..d7b41cee 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -844,7 +844,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm) } #endif - SharedMemoryTest(); + //SharedMemoryTest(); } ////////////////////////////////////////////////////////////////// // On node barrier From fe5aaf7677c11e917af03bd0cbbc14a29b67bd2d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 9 Aug 2021 04:06:30 -0700 Subject: [PATCH 187/201] Make comms benchmark same as Benchmark_comms_host_device --- benchmarks/Benchmark_ITT.cc | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) 
diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 032535b3..7311dfc4 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -133,15 +133,15 @@ public: std::vector xbuf(8); std::vector rbuf(8); - Grid.ShmBufferFreeAll(); + //Grid.ShmBufferFreeAll(); + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); for(int d=0;d<8;d++){ - xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); - rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } - int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); int ncomm; double dbytes; std::vector times(Nloop); @@ -152,7 +152,7 @@ public: dbytes=0; ncomm=0; - thread_for(dir,8,{ + for(int dir=0;dir<8;dir++) { double tbytes; int mu =dir % 4; @@ -168,15 +168,16 @@ public: int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } - tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, - (void *)&rbuf[dir][0], recv_from_rank, - bytes,dir); + Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, + (void *)&rbuf[dir][0], recv_from_rank, + bytes); + tbytes = bytes; thread_critical { ncomm++; dbytes+=tbytes; } } - }); + }; Grid.Barrier(); double stop=usecond(); t_time[i] = stop-start; // microseconds @@ -196,8 +197,12 @@ public: << "\t\t"<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " " << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; - } - } + for(int d=0;d<8;d++){ + acceleratorFreeDevice(xbuf[d]); + acceleratorFreeDevice(rbuf[d]); + } + } + } return; } @@ -281,7 +286,6 @@ public: 
uint64_t lmax=32; -#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat) GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=8){ From 75030637cca1f60173ca760b09dd37852a88299d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 10 Aug 2021 05:16:30 -0700 Subject: [PATCH 188/201] Improved comms benchmark, same as benchmark_comms_host_device --- benchmarks/Benchmark_ITT.cc | 73 ++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 42 deletions(-) diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 7311dfc4..81d1acd4 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -144,23 +144,19 @@ public: int ncomm; double dbytes; - std::vector times(Nloop); - for(int i=0;i1 ) { - dbytes=0; - ncomm=0; + std::vector times(Nloop); + for(int i=0;i1 ) { - + dbytes=0; + double start=usecond(); int xmit_to_rank; int recv_from_rank; + if ( dir == mu ) { int comm_proc=1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); @@ -171,42 +167,35 @@ public: Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, (void *)&rbuf[dir][0], recv_from_rank, bytes); - tbytes = bytes; - thread_critical { - ncomm++; - dbytes+=tbytes; - } + dbytes+=bytes; + + double stop=usecond(); + t_time[i] = stop-start; // microseconds + } - }; - Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // microseconds + timestat.statistics(t_time); + + dbytes=dbytes*ppn; + double xbytes = dbytes*0.5; + double bidibytes = dbytes; + + std::cout<1) ) { + if ( do_comms ) { std::cout< Date: Tue, 10 Aug 2021 05:35:15 -0700 Subject: [PATCH 189/201] Level 0 IPC set up --- Grid/communicator/SharedMemoryMPI.cc | 56 +++++++++++++++++++++++++--- Grid/threads/Accelerator.cc | 5 ++- Grid/threads/Accelerator.h | 7 ++++ 3 files changed, 62 insertions(+), 6 deletions(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index caa03a60..554f338b 100644 --- 
a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -73,6 +73,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) WorldNodes = WorldSize/WorldShmSize; assert( (WorldNodes * WorldShmSize) == WorldSize ); + // FIXME: Check all WorldShmSize are the same ? ///////////////////////////////////////////////////////////////////// @@ -451,7 +452,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) //////////////////////////////////////////////////////////////////////////////////////////// #if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL) -#if defined(GRID_SYCL) +//if defined(GRID_SYCL) +#if 0 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) { void * ShmCommBuf ; @@ -488,7 +490,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) } #endif -#if defined(GRID_CUDA) ||defined(GRID_HIP) +#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) { void * ShmCommBuf ; @@ -511,8 +513,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) /////////////////////////////////////////////////////////////////////////////////////////////////////////// // Each MPI rank should allocate our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// + auto zeDevice = cl::sycl::get_native(theGridAccelerator->get_device()); + auto zeContext= cl::sycl::get_native(theGridAccelerator->get_context()); +#ifdef GRID_SYCL_LEVEL_ZERO_IPC + ze_device_mem_alloc_desc_t zeDesc = {}; + zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf); + std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes + << "bytes at "<< std::hex<< ShmCommBuf < #include + +#define GRID_SYCL_LEVEL_ZERO_IPC + +#ifdef GRID_SYCL_LEVEL_ZERO_IPC +#include +#include +#endif NAMESPACE_BEGIN(Grid); extern 
cl::sycl::queue *theGridAccelerator; From 5d29e175d82edc92af07ca9b708ec7f187fc006d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 10 Aug 2021 18:25:43 +0100 Subject: [PATCH 190/201] Typo fix --- Grid/communicator/SharedMemoryMPI.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc index d230b2a5..11788744 100644 --- a/Grid/communicator/SharedMemoryMPI.cc +++ b/Grid/communicator/SharedMemoryMPI.cc @@ -513,9 +513,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) /////////////////////////////////////////////////////////////////////////////////////////////////////////// // Each MPI rank should allocate our own buffer /////////////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef GRID_SYCL_LEVEL_ZERO_IPC auto zeDevice = cl::sycl::get_native(theGridAccelerator->get_device()); auto zeContext= cl::sycl::get_native(theGridAccelerator->get_context()); -#ifdef GRID_SYCL_LEVEL_ZERO_IPC ze_device_mem_alloc_desc_t zeDesc = {}; zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf); std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes From ffbdd91e0e2770f0e12f9296e5e9991e52799f4d Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 20 Aug 2021 01:15:00 +0100 Subject: [PATCH 191/201] Apple happiness --- Grid/util/Init.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index bfbc464d..3b661524 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -52,10 +52,12 @@ Author: paboyle #include -#ifdef __APPLE__ static int +#ifdef __APPLE__ feenableexcept (unsigned int excepts) { +#if 0 + // Fails on Apple M1 static fenv_t fenv; unsigned int new_excepts = excepts & FE_ALL_EXCEPT; unsigned int old_excepts; // previous masks @@ -70,6 +72,7 @@ feenableexcept (unsigned int excepts) iold_excepts = (int) 
old_excepts; return ( fesetenv (&fenv) ? -1 : iold_excepts ); +#endif } #endif From 7163b31a26bba90241480fe444a3584cd66f5893 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Fri, 20 Aug 2021 01:15:23 +0100 Subject: [PATCH 192/201] Examples --- scripts/filelist | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scripts/filelist b/scripts/filelist index 8e29ba88..c3f0cf3f 100755 --- a/scripts/filelist +++ b/scripts/filelist @@ -82,3 +82,18 @@ for f in $TESTS; do echo >> Make.inc done cd .. + +# examples Make.inc +cd $home/examples/ +echo> Make.inc +TESTS=`ls *.cc` +TESTLIST=`echo ${TESTS} | sed s/.cc//g ` +echo bin_PROGRAMS = ${TESTLIST} > Make.inc +echo >> Make.inc +for f in $TESTS; do + BNAME=`basename $f .cc` + echo ${BNAME}_SOURCES=$f >> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc + echo >> Make.inc +done +cd .. From 40098424c75cb693cf92ebafa758efac8b25053a Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Sun, 22 Aug 2021 14:17:12 +0100 Subject: [PATCH 193/201] Examples --- Makefile.am | 2 +- configure.ac | 1 + examples/Example_Laplacian.cc | 395 ++++++++++++++++++++++++++++++++++ examples/Makefile.am | 6 + 4 files changed, 403 insertions(+), 1 deletion(-) create mode 100644 examples/Example_Laplacian.cc create mode 100644 examples/Makefile.am diff --git a/Makefile.am b/Makefile.am index 33b25026..d2a1a326 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,5 +1,5 @@ # additional include paths necessary to compile the C++ library -SUBDIRS = Grid HMC benchmarks tests +SUBDIRS = Grid HMC benchmarks tests examples include $(top_srcdir)/doxygen.inc diff --git a/configure.ac b/configure.ac index 4e5e33c8..721d890e 100644 --- a/configure.ac +++ b/configure.ac @@ -815,6 +815,7 @@ AC_CONFIG_FILES(tests/smearing/Makefile) AC_CONFIG_FILES(tests/qdpxx/Makefile) AC_CONFIG_FILES(tests/testu01/Makefile) AC_CONFIG_FILES(benchmarks/Makefile) +AC_CONFIG_FILES(examples/Makefile) AC_OUTPUT echo "" diff --git 
a/examples/Example_Laplacian.cc b/examples/Example_Laplacian.cc new file mode 100644 index 00000000..f6bbf7cf --- /dev/null +++ b/examples/Example_Laplacian.cc @@ -0,0 +1,395 @@ +#include +using namespace Grid; + +/* +///////////////////////////////////////////////////////////////////////////////////////////// +// Grid/algorithms/SparseMatrix.h: Interface defining what I expect of a general sparse matrix, such as a Fermion action +///////////////////////////////////////////////////////////////////////////////////////////// +template class SparseMatrixBase { +public: + virtual GridBase *Grid(void) =0; + + virtual void M (const Field &in, Field &out)=0; + virtual void Mdag (const Field &in, Field &out)=0; + virtual void MdagM(const Field &in, Field &out) { + Field tmp (in.Grid()); + M(in,tmp); + Mdag(tmp,out); + } + virtual void Mdiag (const Field &in, Field &out)=0; + virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; + virtual void MdirAll (const Field &in, std::vector &out)=0; +}; +*/ + +const std::vector directions ({Xdir,Ydir,Zdir,Xdir,Ydir,Zdir}); +const std::vector displacements({1,1,1,-1,-1,-1}); + +template class FreeLaplacianCshift : public SparseMatrixBase +{ +public: + GridBase *grid; + FreeLaplacianCshift(GridBase *_grid) + { + grid=_grid; + }; + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out = Zero(); + for(int mu=0;mu &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out + Gimpl::CovShiftForward(Umu,mu,in); + out = out + Gimpl::CovShiftBackward(Umu,mu,in); + out = out - 
2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + + +#define LEG_LOAD(Dir) \ + SE = st.GetEntry(ptype, Dir, ss); \ + if (SE->_is_local ) { \ + int perm= SE->_permute; \ + chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ + } else { \ + chi = coalescedRead(buf[SE->_offset],lane); \ + } \ + acceleratorSynchronise(); + +template class FreeLaplacianStencil : public SparseMatrixBase +{ +public: + typedef typename Field::vector_object siteObject; + typedef CartesianStencil StencilImpl; + + GridBase *grid; + StencilImpl Stencil; + SimpleCompressor Compressor; + + FreeLaplacianStencil(GridBase *_grid) + : Stencil (_grid,6,Even,directions,displacements,0), grid(_grid) + { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &_in, Field &_out) + { + + /////////////////////////////////////////////// + // Halo exchange for this geometry of stencil + /////////////////////////////////////////////// + Stencil.HaloExchange(_in, Compressor); + + /////////////////////////////////// + // Arithmetic expressions + /////////////////////////////////// + StencilEntry *SE; + + // Views; device friendly/accessible pointers + auto st = Stencil.View(AcceleratorRead); + auto buf = st.CommBuf(); + autoView( in , _in , AcceleratorRead); + autoView( out , _out , AcceleratorWrite); + + typedef typename Field::vector_object vobj; + typedef decltype(coalescedRead(in[0])) calcObj; + + const int Nsimd = vobj::Nsimd(); + const uint64_t NN = grid->oSites(); + const int lane=acceleratorSIMTlane(Nsimd); + + accelerator_for( ss, NN, Nsimd, { + + const int 
lane=acceleratorSIMTlane(Nsimd); + + calcObj chi; + calcObj res; + int ptype; + + res = coalescedRead(in[ss])*(-6.0); + LEG_LOAD(0); res = res + chi; + LEG_LOAD(1); res = res + chi; + LEG_LOAD(2); res = res + chi; + LEG_LOAD(3); res = res + chi; + LEG_LOAD(4); res = res + chi; + LEG_LOAD(5); res = res + chi; + + coalescedWrite(out[ss], res,lane); + + }); + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +template class CovariantLaplacianStencil : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Field::vector_object siteObject; + + template using iImplDoubledGaugeField = iVector >, Nds>; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + typedef Lattice DoubledGaugeField; + + typedef CartesianStencil StencilImpl; + + GridBase *grid; + StencilImpl Stencil; + SimpleCompressor Compressor; + DoubledGaugeField Uds; + CovariantLaplacianStencil(GaugeField &Umu) + : + grid(Umu.Grid()), + Stencil (grid,6,Even,directions,displacements,0), + Uds(grid) + { + for (int mu = 0; mu < Nd; mu++) { + auto U = PeekIndex(Umu, mu); + PokeIndex(Uds, U, mu ); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uds, U, mu + 4); + } + }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &_in, Field &_out) + { + /////////////////////////////////////////////// + // Halo exchange for this geometry of stencil + /////////////////////////////////////////////// + Stencil.HaloExchange(_in, Compressor); + + /////////////////////////////////// + // Arithmetic expressions + /////////////////////////////////// + auto st = 
Stencil.View(AcceleratorRead); + auto buf = st.CommBuf(); + StencilEntry *SE; + + autoView( in , _in , AcceleratorRead); + autoView( out , _out , AcceleratorWrite); + autoView( U , Uds , AcceleratorRead); + + typedef typename Field::vector_object vobj; + typedef decltype(coalescedRead(in[0])) calcObj; + typedef decltype(coalescedRead(U[0](0))) calcLink; + + const int Nsimd = vobj::Nsimd(); + const uint64_t NN = grid->oSites(); + const int lane=acceleratorSIMTlane(Nsimd); + accelerator_for( ss, NN, Nsimd, { + + const int lane=acceleratorSIMTlane(Nsimd); + + calcObj chi; + calcObj res; + calcObj Uchi; + calcLink UU; + int ptype; + + res = coalescedRead(in[ss])*(-6.0); + +#define LEG_LOAD_MULT(leg,polarisation) \ + UU = coalescedRead(U[ss](polarisation)); \ + LEG_LOAD(leg); \ + mult(&Uchi(), &UU, &chi()); \ + res = res + Uchi; + + LEG_LOAD_MULT(0,Xp); + LEG_LOAD_MULT(1,Yp); + LEG_LOAD_MULT(2,Zp); + LEG_LOAD_MULT(3,Xm); + LEG_LOAD_MULT(4,Ym); + LEG_LOAD_MULT(5,Zm); + + coalescedWrite(out[ss], res,lane); + }); + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +#undef LEG_LOAD_MULT +#undef LEG_LOAD + +int main(int argc, char ** argv) +{ + Grid_init(&argc, &argv); + + typedef LatticeColourVector Field; + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + FreeLaplacianCshift FLcs(&Grid); + FreeLaplacianStencil FLst(&Grid); + + LatticeGaugeField U(&Grid); + + 
SU::ColdConfiguration(RNG,U); + + std::cout << " Gauge field has norm " < CLcs(U); + CovariantLaplacianStencil CLst(U); + + Field in(&Grid); gaussian(RNG,in); + Field out_FLcs(&Grid); + Field out_FLst(&Grid); + Field out_CLcs(&Grid); + Field out_CLst(&Grid); + Field diff(&Grid); + + //////////////////////////////////////////////////////// + // First test: in free field these should all agree + //////////////////////////////////////////////////////// + FLcs.M(in,out_FLcs); + FLst.M(in,out_FLst); + CLcs.M(in,out_CLcs); + CLst.M(in,out_CLst); + + std:: cout << "******************************************************************" <::RandomGaugeTransform(RNG,U_GT,g); // Unit gauge + + Field in_GT(&Grid); + Field out_GT(&Grid); + + Field out_CLcs_GT(&Grid); + Field out_CLst_GT(&Grid); + + CovariantLaplacianCshift CLcs_GT(U_GT); + CovariantLaplacianStencil CLst_GT(U_GT); + + in_GT = g*in; + out_GT = g*out_FLcs; + + // Check M^GT_xy in_GT = g(x) M_xy g^dag(y) g(y) in = g(x) out(x) + CLcs_GT.M(in_GT,out_CLcs_GT); + CLst_GT.M(in_GT,out_CLst_GT); + + diff = out_CLcs_GT - out_GT; + std:: cout << " Difference between Gauge xformed result and covariant Cshift Laplacian in xformed gauge = " < dim_mask({1,1,1,0}); // 3d FFT + FFT theFFT(&Grid); + Field out(&Grid); + Field F_out(&Grid); + Field F_in(&Grid); + + // FFT the random input vector + theFFT.FFT_dim_mask(F_in,in,dim_mask,FFT::forward); + + // Convolution theorem: multiply by Fourier representation of (discrete) Laplacian to apply diff op + LatticeComplexD lap(&Grid); lap = Zero(); + LatticeComplexD kmu(&Grid); + ComplexD ci(0.0,1.0); + for(int mu=0;mu<3;mu++) { + + RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; + + LatticeCoordinate(kmu,mu); + kmu = TwoPiL * kmu; + + // (e^ik_mu + e^-ik_mu - 2) = 2( cos kmu - 1) ~ 2 (1 - k_mu^2/2 -1 ) = - k_mu^2 + O(k^4) + lap = lap + 2.0*cos(kmu) - 2.0; + + } + F_out = lap * F_in; + + // Inverse FFT the result + theFFT.FFT_dim_mask(out,F_out,dim_mask,FFT::backward); + + std::cout<<"Fourier 
xformed (in) "< Date: Sun, 22 Aug 2021 18:25:07 +0100 Subject: [PATCH 194/201] Extra examples / solutions --- examples/Example_Laplacian_inverter.cc | 122 ++++++++++++++++++++++++ examples/Example_Laplacian_smearing.cc | 127 +++++++++++++++++++++++++ examples/Example_Laplacian_solver.cc | 127 +++++++++++++++++++++++++ 3 files changed, 376 insertions(+) create mode 100644 examples/Example_Laplacian_inverter.cc create mode 100644 examples/Example_Laplacian_smearing.cc create mode 100644 examples/Example_Laplacian_solver.cc diff --git a/examples/Example_Laplacian_inverter.cc b/examples/Example_Laplacian_inverter.cc new file mode 100644 index 00000000..1235d2b8 --- /dev/null +++ b/examples/Example_Laplacian_inverter.cc @@ -0,0 +1,122 @@ +#include +using namespace Grid; + +// Function used for Chebyshev smearing +// +Real MomentumSmearing(Real p2) +{ + return (1 - 4.0*p2) * exp(-p2/4); +} +Real DistillationSmearing(Real p2) +{ + if ( p2 > 0.5 ) return 0.0; + else return 1.0; +} + +// Flip sign to make prop to p^2, not -p^2 relative to last example +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for 
multigrid +}; + + + +int main(int argc, char ** argv) +{ + Grid_init(&argc, &argv); + + typedef LatticeColourVector Field; + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + + LatticeGaugeField U(&Grid); + + SU::ColdConfiguration(RNG,U); + + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + ColourVector ColourKronecker; + ColourKronecker = Zero(); + ColourKronecker()()(0) = 1.0; + + Coordinate site({latt_size[0]/2, + latt_size[1]/2, + latt_size[2]/2, + 0}); + + Field kronecker(&Grid); + kronecker = Zero(); + pokeSite(ColourKronecker,kronecker,site); + + + Field psi(&Grid), chi(&Grid); + + ////////////////////////////////////// + // Classic Wuppertal smearing + ////////////////////////////////////// + + Integer Iterations = 80; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + chi=kronecker; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(chi,psi); + chi = chi - coeff*psi; + } + + std::cout << " Wuppertal smeared operator is chi = \n" << chi < HermOp(Laplacian); + + Chebyshev ChebySmear(lo,hi,20,DistillationSmearing); + // Chebyshev ChebySmear(lo,hi,20,MomentumSmearing); + { + std::ofstream of("chebysmear"); + ChebySmear.csv(of); + } + + ChebySmear(HermOp,kronecker,chi); + + std::cout << " Chebyshev smeared operator is chi = \n" << chi < +using namespace Grid; + +// Function used for Chebyshev smearing +// +Real MomentumSmearing(Real p2) +{ + return (1 - 4.0*p2) * exp(-p2/4); +} +Real DistillationSmearing(Real p2) +{ + if ( p2 > 0.5 ) return 0.0; + else return 1.0; +} + +// Flip sign to make prop to p^2, not -p^2 relative to last example +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + 
GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + + + +int main(int argc, char ** argv) +{ + Grid_init(&argc, &argv); + + typedef LatticeColourVector Field; + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + + LatticeGaugeField U(&Grid); + + SU::ColdConfiguration(RNG,U); + + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + + ColourVector ColourKronecker; + ColourKronecker = Zero(); + ColourKronecker()()(0) = 1.0; + + Coordinate site({latt_size[0]/2, + latt_size[1]/2, + latt_size[2]/2, + 0}); + + Field kronecker(&Grid); + kronecker = Zero(); + pokeSite(ColourKronecker,kronecker,site); + + + Field psi(&Grid), chi(&Grid); + + ////////////////////////////////////// + // Classic Wuppertal smearing + ////////////////////////////////////// + + Integer Iterations = 80; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + chi=kronecker; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + 
Laplacian.M(chi,psi); + chi = chi - coeff*psi; + } + + std::cout << " Wuppertal smeared operator is chi = \n" << chi < HermOp(Laplacian); + + std::cout << " Checking spectral range of our POSITIVE definite operator \n"; + PowerMethod PM; + PM(HermOp,kronecker); + + Chebyshev ChebySmear(lo,hi,20,DistillationSmearing); + // Chebyshev ChebySmear(lo,hi,20,MomentumSmearing); + { + std::ofstream of("chebysmear"); + ChebySmear.csv(of); + } + + ChebySmear(HermOp,kronecker,chi); + + std::cout << " Chebyshev smeared operator is chi = \n" << chi < +using namespace Grid; + +template +void SimpleConjugateGradient(LinearOperatorBase &HPDop,const Field &b, Field &x) +{ + RealD cp, c, alpha, d, beta, ssq, qq; + RealD Tolerance=1.0e-10; + int MaxIterations=10000; + + Field p(b), mmp(b), r(b); + + HPDop.HermOpAndNorm(x, mmp, d, beta); + + r = b - mmp; + p = r; + + alpha = norm2(p); + cp = alpha; + ssq = norm2(b); + + RealD rsq = Tolerance * Tolerance * ssq; + + for (int k = 1; k <= MaxIterations; k++) { + c = cp; + + HPDop.HermOp(p, mmp); + + ComplexD dc = innerProduct(p,mmp); + d = dc.real(); + alpha = c / d; + + cp = axpy_norm(r, -alpha, mmp, r); + beta = cp / c; + + x = x + alpha* p ; + p = r + beta* p ; + + std::cout << "iteration "< class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + RealD m2=1.0e-2; + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in + m2*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const 
Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + + + +int main(int argc, char ** argv) +{ + Grid_init(&argc, &argv); + + typedef LatticeColourVector Field; + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + + LatticeGaugeField U(&Grid); + + SU::ColdConfiguration(RNG,U); + + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + + ColourVector ColourKronecker; + ColourKronecker = Zero(); + ColourKronecker()()(0) = 1.0; + + Coordinate site({0,0,0,0}); // Point source at origin + + Field kronecker(&Grid); + kronecker = Zero(); + pokeSite(ColourKronecker,kronecker,site); + + Field psi(&Grid); psi=Zero(); + + HermitianLinearOperator HermOp(Laplacian); + SimpleConjugateGradient(HermOp, kronecker,psi); + + Field r(&Grid); + Laplacian.M(psi,r); + r=kronecker-r; + + std::cout << "True residual "<< norm2(r) < Date: Sun, 22 Aug 2021 18:28:39 +0100 Subject: [PATCH 195/201] Remove the file --- examples/Example_Laplacian_inverter.cc | 122 ------------------------- 1 file changed, 122 deletions(-) delete mode 100644 examples/Example_Laplacian_inverter.cc diff --git a/examples/Example_Laplacian_inverter.cc b/examples/Example_Laplacian_inverter.cc deleted file mode 100644 index 1235d2b8..00000000 --- a/examples/Example_Laplacian_inverter.cc +++ /dev/null @@ -1,122 +0,0 @@ -#include -using namespace Grid; - -// Function used for Chebyshev smearing -// -Real MomentumSmearing(Real p2) -{ - return (1 - 4.0*p2) * exp(-p2/4); -} -Real DistillationSmearing(Real p2) -{ - if ( p2 > 0.5 ) return 0.0; - else return 1.0; -} - -// Flip sign to make prop to p^2, not -p^2 relative to 
last example -template class CovariantLaplacianCshift : public SparseMatrixBase -{ -public: - INHERIT_GIMPL_TYPES(Gimpl); - - GridBase *grid; - GaugeField U; - - CovariantLaplacianCshift(GaugeField &_U) : - grid(_U.Grid()), - U(_U) { }; - - virtual GridBase *Grid(void) { return grid; }; - - virtual void M (const Field &in, Field &out) - { - out=Zero(); - for(int mu=0;mu(U, mu); // NB: Inefficent - out = out - Gimpl::CovShiftForward(Umu,mu,in); - out = out - Gimpl::CovShiftBackward(Umu,mu,in); - out = out + 2.0*in; - } - }; - virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian - virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid - virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid - virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid -}; - - - -int main(int argc, char ** argv) -{ - Grid_init(&argc, &argv); - - typedef LatticeColourVector Field; - - auto latt_size = GridDefaultLatt(); - auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); - auto mpi_layout = GridDefaultMpi(); - - GridCartesian Grid(latt_size,simd_layout,mpi_layout); - GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); - - - LatticeGaugeField U(&Grid); - - SU::ColdConfiguration(RNG,U); - - typedef CovariantLaplacianCshift Laplacian_t; - Laplacian_t Laplacian(U); - - ColourVector ColourKronecker; - ColourKronecker = Zero(); - ColourKronecker()()(0) = 1.0; - - Coordinate site({latt_size[0]/2, - latt_size[1]/2, - latt_size[2]/2, - 0}); - - Field kronecker(&Grid); - kronecker = Zero(); - pokeSite(ColourKronecker,kronecker,site); - - - Field psi(&Grid), chi(&Grid); - - ////////////////////////////////////// - // Classic Wuppertal smearing - ////////////////////////////////////// - - Integer Iterations = 80; - Real width = 2.0; - Real coeff = (width*width) / 
Real(4*Iterations); - - chi=kronecker; - // chi = (1-p^2/2N)^N kronecker - for(int n = 0; n < Iterations; ++n) { - Laplacian.M(chi,psi); - chi = chi - coeff*psi; - } - - std::cout << " Wuppertal smeared operator is chi = \n" << chi < HermOp(Laplacian); - - Chebyshev ChebySmear(lo,hi,20,DistillationSmearing); - // Chebyshev ChebySmear(lo,hi,20,MomentumSmearing); - { - std::ofstream of("chebysmear"); - ChebySmear.csv(of); - } - - ChebySmear(HermOp,kronecker,chi); - - std::cout << " Chebyshev smeared operator is chi = \n" << chi < Date: Sun, 22 Aug 2021 18:40:55 +0100 Subject: [PATCH 196/201] Fail on non-apple --- Grid/util/Init.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 3b661524..c0bfc906 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -52,8 +52,8 @@ Author: paboyle #include -static int #ifdef __APPLE__ +static int feenableexcept (unsigned int excepts) { #if 0 From 5b3c530aa778b23ae9383a5b7a63fd20f5d9c2cb Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Aug 2021 15:30:45 +0100 Subject: [PATCH 197/201] Return value --- Grid/util/Init.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index c0bfc906..ab2d2399 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -73,6 +73,7 @@ feenableexcept (unsigned int excepts) iold_excepts = (int) old_excepts; return ( fesetenv (&fenv) ? 
-1 : iold_excepts ); #endif + return 0; } #endif From 0d588b95f4e1847024c4eea0094f7fc0dc36ad28 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Aug 2021 23:14:26 +0100 Subject: [PATCH 198/201] Bug fix to Example_Laplacian test --- Grid/threads/Accelerator.h | 6 ++++++ examples/Example_Laplacian.cc | 9 +++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 2c9d15ba..a8c91aa8 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -39,6 +39,10 @@ Author: paboyle #ifdef HAVE_MM_MALLOC_H #include #endif +#ifdef __APPLE__ +// no memalign +inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); } +#endif NAMESPACE_BEGIN(Grid); @@ -419,6 +423,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas #undef GRID_SIMT + + #define accelerator #define accelerator_inline strong_inline #define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ }); diff --git a/examples/Example_Laplacian.cc b/examples/Example_Laplacian.cc index f6bbf7cf..fa8466cf 100644 --- a/examples/Example_Laplacian.cc +++ b/examples/Example_Laplacian.cc @@ -116,7 +116,6 @@ public: /////////////////////////////////// // Arithmetic expressions /////////////////////////////////// - StencilEntry *SE; // Views; device friendly/accessible pointers auto st = Stencil.View(AcceleratorRead); @@ -129,10 +128,11 @@ public: const int Nsimd = vobj::Nsimd(); const uint64_t NN = grid->oSites(); - const int lane=acceleratorSIMTlane(Nsimd); accelerator_for( ss, NN, Nsimd, { + StencilEntry *SE; + const int lane=acceleratorSIMTlane(Nsimd); calcObj chi; @@ -202,7 +202,6 @@ public: /////////////////////////////////// auto st = Stencil.View(AcceleratorRead); auto buf = st.CommBuf(); - StencilEntry *SE; autoView( in , _in , AcceleratorRead); autoView( out , _out , AcceleratorWrite); @@ -214,9 +213,11 @@ public: const int Nsimd = vobj::Nsimd(); const 
uint64_t NN = grid->oSites(); - const int lane=acceleratorSIMTlane(Nsimd); + accelerator_for( ss, NN, Nsimd, { + StencilEntry *SE; + const int lane=acceleratorSIMTlane(Nsimd); calcObj chi; From 114920b8de4048b250206b4bd3ac469cd7fffd95 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 25 Aug 2021 12:24:17 +0100 Subject: [PATCH 199/201] Some example clean up --- examples/Example_Laplacian_smearing.cc | 4 ++-- examples/Example_Laplacian_solver.cc | 14 ++++++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples/Example_Laplacian_smearing.cc b/examples/Example_Laplacian_smearing.cc index 327c5ca9..9780e8a0 100644 --- a/examples/Example_Laplacian_smearing.cc +++ b/examples/Example_Laplacian_smearing.cc @@ -112,8 +112,8 @@ int main(int argc, char ** argv) PowerMethod PM; PM(HermOp,kronecker); - Chebyshev ChebySmear(lo,hi,20,DistillationSmearing); - // Chebyshev ChebySmear(lo,hi,20,MomentumSmearing); + // Chebyshev ChebySmear(lo,hi,20,DistillationSmearing); + Chebyshev ChebySmear(lo,hi,20,MomentumSmearing); { std::ofstream of("chebysmear"); ChebySmear.csv(of); diff --git a/examples/Example_Laplacian_solver.cc b/examples/Example_Laplacian_solver.cc index 88275df8..4dc00280 100644 --- a/examples/Example_Laplacian_solver.cc +++ b/examples/Example_Laplacian_solver.cc @@ -15,8 +15,7 @@ void SimpleConjugateGradient(LinearOperatorBase &HPDop,const Field &b, Fi r = b - mmp; p = r; - alpha = norm2(p); - cp = alpha; + cp = alpha = norm2(p); ssq = norm2(b); RealD rsq = Tolerance * Tolerance * ssq; @@ -26,11 +25,12 @@ void SimpleConjugateGradient(LinearOperatorBase &HPDop,const Field &b, Fi HPDop.HermOp(p, mmp); - ComplexD dc = innerProduct(p,mmp); - d = dc.real(); + d = real(innerProduct(p,mmp)); + alpha = c / d; - cp = axpy_norm(r, -alpha, mmp, r); + r = r - alpha *mmp; + cp = norm2(r); beta = cp / c; x = x + alpha* p ; @@ -121,7 +121,9 @@ int main(int argc, char ** argv) r=kronecker-r; std::cout << "True residual "<< norm2(r) < Date: Mon, 30 Aug 2021 
20:32:11 -0400 Subject: [PATCH 200/201] Some sample code --- examples/Example_Mobius_spectrum.cc | 334 ++++++++++++++++++++++++++++ 1 file changed, 334 insertions(+) create mode 100644 examples/Example_Mobius_spectrum.cc diff --git a/examples/Example_Mobius_spectrum.cc b/examples/Example_Mobius_spectrum.cc new file mode 100644 index 00000000..dd84a336 --- /dev/null +++ b/examples/Example_Mobius_spectrum.cc @@ -0,0 +1,334 @@ +/************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + + Copyright (C) 2021 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +// Flip sign to make prop to p^2, not -p^2 relative to last example +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +void MakePhase(Coordinate mom,LatticeComplex &phase) +{ + GridBase *grid = phase.Grid(); + auto latt_size = grid->GlobalDimensions(); + ComplexD ci(0.0,1.0); + phase=Zero(); + + LatticeComplex coor(phase.Grid()); + for(int mu=0;mu +void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) +{ + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + Integer Iterations = 40; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + Field tmp(U.Grid()); + smeared=unsmeared; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(smeared,tmp); + smeared = smeared - coeff*tmp; + std::cout << " smear iter " << n<<" " < +void 
Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = D.GaugeGrid(); + GridBase *FGrid = D.FermionGrid(); + + LatticeFermion src4 (UGrid); + LatticeFermion src5 (FGrid); + LatticeFermion result5(FGrid); + LatticeFermion result4(UGrid); + + ConjugateGradient CG(1.0e-8,100000); + SchurRedBlackDiagMooeeSolve schur(CG); + ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors + for(int s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + + result5=Zero(); + schur(D,src5,result5,ZG); + std::cout<(propagator,result4,s,c); + } + } +} + +class MesonFile: Serializable { +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector >, data); +}; + +void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) +{ + const int nchannel=4; + Gamma::Algebra Gammas[nchannel][2] = { + {Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5}, + {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5}, + {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5}, + {Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5} + }; + + Gamma G5(Gamma::Algebra::Gamma5); + + LatticeComplex meson_CF(q1.Grid()); + MesonFile MF; + + for(int ch=0;ch meson_T; + sliceSum(meson_CF,meson_T, Tdir); + + int nt=meson_T.size(); + + std::vector corr(nt); + for(int t=0;t seeds4({1,2,3,4}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + std::string config; + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); + config="HotConfig"; + } + + std::vector masses({ 0.03,0.04,0.45} ); // u/d, s, c ?? 
+ + int nmass = masses.size(); + + std::vector FermActs; + + std::cout< PointProps(nmass,UGrid); + std::vector GaussProps(nmass,UGrid); + std::vector Z2Props (nmass,UGrid); + + for(int m=0;m Date: Mon, 30 Aug 2021 21:15:39 -0400 Subject: [PATCH 201/201] Comment update --- examples/Example_Mobius_spectrum.cc | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/examples/Example_Mobius_spectrum.cc b/examples/Example_Mobius_spectrum.cc index dd84a336..f4cd3335 100644 --- a/examples/Example_Mobius_spectrum.cc +++ b/examples/Example_Mobius_spectrum.cc @@ -1,33 +1,13 @@ -/************************************************************************************* - Grid physics library, www.github.com/paboyle/Grid +/* + * Warning: This code illustrative only: not well tested, and not meant for production use + * without regression / tests being applied + */ - Copyright (C) 2021 - -Author: Peter Boyle - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ #include using namespace std; using namespace Grid; -// Flip sign to make prop to p^2, not -p^2 relative to last example template class CovariantLaplacianCshift : public SparseMatrixBase { public: