From f0dc0f36214c4dda1eace9d83956e5d7fef4f729 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sat, 22 Aug 2020 13:57:33 +0200 Subject: [PATCH 01/14] fix compile issue on Qpace3 --- Grid/lattice/Lattice_transfer.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index e698e40e..91de721f 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -127,6 +127,11 @@ accelerator_inline void convertType(T1 & out, const iScalar & in) { convertType(out,in._internal); } +template::value, T1>::type* = nullptr> +accelerator_inline void convertType(T1 & out, const iScalar & in) { + convertType(out,in._internal); +} + template accelerator_inline void convertType(iScalar & out, const T2 & in) { convertType(out._internal,in); From 1292d595634172e28158f99d31863d63267ca5ac Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Thu, 11 Jun 2020 13:16:00 +0200 Subject: [PATCH 02/14] Add a typedef + broaden interface of CMat --- Grid/algorithms/CoarsenedMatrix.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 8d184aea..76950baf 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -268,6 +268,21 @@ public: typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; + typedef CoarseVector FermionField; + + // enrich interface + void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeInv(CoarseVector const& in, CoarseVector& out) { assert(0); } + void MooeeInvDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; } + void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; } + void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; + void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; //////////////////// // Data members From dd1ba266b269b093a8bcd9bf815438d9896d7148 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Fri, 17 Jul 2020 11:58:02 +0200 Subject: [PATCH 03/14] Fix mapping between dir + disp and point in CMat --- Grid/algorithms/CoarsenedMatrix.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 76950baf..d18fba43 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -432,25 +432,25 @@ public: ////////////// // 4D action like wilson - // 0+ => 0 - // 0- => 1 - // 1+ => 2 - // 1- => 3 + // 0+ => 0 + // 0- => 4 + // 1+ => 1 + // 1- => 5 // etc.. ////////////// // 5D action like DWF - // 1+ => 0 - // 1- => 1 - // 2+ => 2 - // 2- => 3 + // 1+ => 0 + // 1- => 4 + // 2+ => 1 + // 2- => 5 // etc.. auto point = [dir, disp, ndim](){ if(dir == 0 and disp == 0) return 8; else if ( ndim==4 ) { - return (4 * dir + 1 - disp) / 2; + return (1 - disp) / 2 * 4 + dir; } else { - return (4 * (dir-1) + 1 - disp) / 2; + return (1 - disp) / 2 * 4 + dir - 1; } }(); From b2087f14c48e881ab041bbaef7f6c444c88a895d Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 24 Aug 2020 16:54:36 +0200 Subject: [PATCH 04/14] Fix CoarsenedMatrix regarding illegal memory accesses Need a reference to geom since the lambda copies the this pointer which points to host memory, see - https://docs.nvidia.com/cuda/cuda-c-programming-guide/#star-this-capture - https://devblogs.nvidia.com/new-compiler-features-cuda-8/ --- Grid/algorithms/CoarsenedMatrix.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index d18fba43..ba40535c 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -310,6 +310,8 @@ public: Stencil.HaloExchange(in,compressor); autoView( in_v , in, AcceleratorRead); autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -331,14 +333,14 @@ public: int ptype; StencilEntry *SE; - for(int point=0;point_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -382,6 +384,7 @@ public: autoView( out_v , out, AcceleratorWrite); autoView( in_v , in, AcceleratorRead); + autoView( Stencil_v , Stencil, AcceleratorRead); const int Nsimd = CComplex::Nsimd(); typedef decltype(coalescedRead(in_v[0])) calcVector; @@ -395,12 +398,12 @@ public: int ptype; StencilEntry *SE; - SE=Stencil.GetEntry(ptype,point,ss); + SE=Stencil_v.GetEntry(ptype,point,ss); if(SE->_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); From 2a75516330925bd05681f4dba639482ca84270d7 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Wed, 26 Aug 2020 12:34:17 -0400 Subject: [PATCH 05/14] state MPI/SLURM message only on world_rank zero --- Grid/threads/Accelerator.cc | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 864d90a9..f6df4a31 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -21,22 +21,26 @@ void acceleratorInit(void) #define ENV_RANK_SLURM "SLURM_PROCID" #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" - // We extract the local rank initialization using an environment variable - if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { - printf("OPENMPI detected\n"); - rank = atoi(localRankStr); - } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { - printf("MVAPICH detected\n"); - rank = atoi(localRankStr); - } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { - printf("SLURM detected\n"); - rank = atoi(localRankStr); - } else { - printf("MPI version is unknown - bad things may happen\n"); - } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);} + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { + if (!world_rank) + printf("OPENMPI detected\n"); + rank = atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { + if (!world_rank) + printf("MVAPICH detected\n"); + rank = atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { + if (!world_rank) + printf("SLURM detected\n"); + rank = atoi(localRankStr); + } else { + if (!world_rank) + printf("MPI version is unknown - bad things may happen\n"); + } size_t totalDeviceMem=0; for (int i = 0; i < nDevices; i++) { From cf3535d16e412adf80ca1a2f7ead54003860c94b Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Thu, 27 Aug 2020 14:06:48 +0200 Subject: [PATCH 06/14] Expose more functions in CMat --- Grid/algorithms/CoarsenedMatrix.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index ba40535c..cdfc2ac3 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -271,6 +271,9 @@ public: typedef CoarseVector FermionField; // enrich interface + void Dhop(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } + void DhopEO(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } + void DhopOE(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } From 4d2dc7ba0304397f536a2bc71260266e1475ae83 Mon Sep 17 00:00:00 2001 From: Daniel Richtmann Date: Mon, 7 Sep 2020 17:57:07 +0200 Subject: [PATCH 07/14] Enable even-odd for CoarsenedMatrix --- Grid/algorithms/CoarsenedMatrix.h | 471 +++++++++++++++++++++++--- tests/solver/Test_coarse_even_odd.cc | 475 +++++++++++++++++++++++++++ 2 files changed, 899 insertions(+), 47 deletions(-) create mode 100644 tests/solver/Test_coarse_even_odd.cc diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index cdfc2ac3..66b9c169 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -31,6 +31,7 @@ Author: paboyle #ifndef GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H +#include // needed for Dagger(Yes|No), Inverse(Yes|No) NAMESPACE_BEGIN(Grid); @@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice &CoarseInner, class Geometry { public: int npoint; + int base; std::vector directions ; std::vector displacements; + std::vector points_dagger; Geometry(int _d) { - int base = (_d==5) ? 1:0; + base = (_d==5) ? 1:0; // make coarse grid stencil for 4d , not 5d if ( _d==5 ) _d=4; @@ -72,16 +75,51 @@ public: npoint = 2*_d+1; directions.resize(npoint); displacements.resize(npoint); + points_dagger.resize(npoint); for(int d=0;d<_d;d++){ directions[d ] = d+base; directions[d+_d] = d+base; displacements[d ] = +1; displacements[d+_d]= -1; + points_dagger[d ] = d+_d; + points_dagger[d+_d] = d; } directions [2*_d]=0; displacements[2*_d]=0; + points_dagger[2*_d]=2*_d; } + int point(int dir, int disp) { + assert(disp == -1 || disp == 0 || disp == 1); + assert(base+0 <= dir && dir < base+4); + + // directions faster index = new indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 1 2 3 0 1 2 3 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 2 3 4 1 2 3 4 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + + // displacements faster index = old indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 0 1 1 2 2 3 3 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 1 2 2 3 3 4 4 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + + if(dir == 0 and disp == 0) + return 8; + else // New indexing + return (1 - disp) / 2 * 4 + dir - base; + // else // Old indexing + // return (4 * (dir - base) + 1 - disp) / 2; + } }; template @@ -258,7 +296,7 @@ public: // Fine Object == (per site) type of fine field // nbasis == number of deflation vectors template -class CoarsenedMatrix : public SparseMatrixBase > > { +class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase > > { public: typedef iVector siteVector; @@ -270,16 +308,7 @@ public: typedef Lattice FineField; typedef CoarseVector FermionField; - // enrich interface - void Dhop(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void DhopEO(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void DhopOE(CoarseVector const& in, CoarseVector& out, int dag) { assert(0); } - void Meooe(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MeooeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } - void Mooee(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeDag(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeInv(CoarseVector const& in, CoarseVector& out) { assert(0); } - void MooeeInvDag(CoarseVector const& in, CoarseVector& out) { assert(0); } + // enrich interface, use default implementation as in FermionOperator /////// void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; } void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; } void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; } @@ -292,21 +321,36 @@ public: //////////////////// Geometry geom; GridBase * _grid; + GridBase* _cbgrid; int hermitian; CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; std::vector A; - + std::vector Aeven; + std::vector Aodd; + + CoarseMatrix AselfInv; + CoarseMatrix AselfInvEven; + CoarseMatrix AselfInvOdd; + + Vector dag_factor; + /////////////////////// // Interface /////////////////////// GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know + GridBase * RedBlackGrid() { return _cbgrid; }; + + int ConstEE() { return 0; } void M (const CoarseVector &in, CoarseVector &out) { conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); SimpleCompressor compressor; @@ -364,12 +408,72 @@ public: return M(in,out); } else { // corresponds to Galerkin coarsening - CoarseVector tmp(Grid()); - G5C(tmp, in); - M(tmp, out); - G5C(out, out); + return MdagNonHermitian(in, out); } }; + + void MdagNonHermitian(const CoarseVector &in, CoarseVector &out) + { + conformable(_grid,in.Grid()); + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + + for(int p=0;poSites(); + + Vector points(geom.npoint, 0); + for(int p=0; poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; @@ -379,6 +483,7 @@ public: { conformable(_grid,in.Grid()); conformable(_grid,out.Grid()); + out.Checkerboard() = in.Checkerboard(); typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -434,34 +539,7 @@ public: this->MdirComms(in); - int ndim = in.Grid()->Nd(); - - ////////////// - // 4D action like wilson - // 0+ => 0 - // 0- => 4 - // 1+ => 1 - // 1- => 5 - // etc.. - ////////////// - // 5D action like DWF - // 1+ => 0 - // 1- => 4 - // 2+ => 1 - // 2- => 5 - // etc.. - auto point = [dir, disp, ndim](){ - if(dir == 0 and disp == 0) - return 8; - else if ( ndim==4 ) { - return (1 - disp) / 2 * 4 + dir; - } else { - return (1 - disp) / 2 * 4 + dir - 1; - } - }(); - - MdirCalc(in,out,point); - + MdirCalc(in,out,geom.point(dir,disp)); }; void Mdiag(const CoarseVector &in, CoarseVector &out) @@ -470,17 +548,269 @@ public: MdirCalc(in, out, point); // No comms }; + void Mooee(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseNo); + } + + void MooeeInv(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseYes); + } + + void MooeeDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseNo); + } + + void MooeeInvDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseYes); + } + + void Meooe(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } + } + + void MeooeDag(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } + } + + void Dhop(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, A, in, out, dag); + } + + void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, Aodd, in, out, dag); + } + + void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + out.Checkerboard() = Even; + + DhopInternal(StencilOdd, Aeven, in, out, dag); + } + + void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) { + out.Checkerboard() = in.Checkerboard(); + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + + CoarseMatrix *Aself = nullptr; + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1]; + DselfInternal(StencilOdd, *Aself, in, out, dag); + } else { + Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1]; + DselfInternal(StencilEven, *Aself, in, out, dag); + } + } else { + Aself = (inv) ? &AselfInv : &A[geom.npoint-1]; + DselfInternal(Stencil, *Aself, in, out, dag); + } + assert(Aself != nullptr); + } + + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, + const CoarseVector &in, CoarseVector &out, int dag) { + int point = geom.npoint-1; + autoView( out_v, out, AcceleratorWrite); + autoView( in_v, in, AcceleratorRead); + autoView( st_v, st, AcceleratorRead); + autoView( a_v, a, AcceleratorRead); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + RealD* dag_factor_p = &dag_factor[0]; + + if(dag) { + accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb &st, std::vector &a, + const CoarseVector &in, CoarseVector &out, int dag) { + SimpleCompressor compressor; + + st.HaloExchange(in,compressor); + autoView( in_v, in, AcceleratorRead); + autoView( out_v, out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + typedef LatticeView Aview; + + // determine in what order we need the points + int npoint = geom.npoint-1; + Vector points(npoint, 0); + for(int p=0; p AcceleratorViewContainer; + for(int p=0;poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb > &linop, Aggregation & Subspace) { @@ -629,6 +959,9 @@ public: std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); + + typedef typename Cobj::scalar_object scalar_object; + + autoView(Aself_v, A[geom.npoint-1], CpuRead); + autoView(AselfInv_v, AselfInv, CpuWrite); + thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke + Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + + scalar_object selfLink = Zero(); + scalar_object selfLinkInv = Zero(); + + Coordinate lcoor; + + Grid()->LocalIndexToLocalCoor(site, lcoor); + peekLocalSite(selfLink, Aself_v, lcoor); + + for (int i = 0; i < nbasis; ++i) + for (int j = 0; j < nbasis; ++j) + selfLinkEigen(i, j) = static_cast(TensorRemove(selfLink(i, j))); + + selfLinkInvEigen = selfLinkEigen.inverse(); + + for(int i = 0; i < nbasis; ++i) + for(int j = 0; j < nbasis; ++j) + selfLinkInv(i, j) = selfLinkInvEigen(i, j); + + pokeLocalSite(selfLinkInv, AselfInv_v, lcoor); + }); + } + + void FillHalfCbs() { + std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl; + for(int p = 0; p < geom.npoint; ++p) { + pickCheckerboard(Even, Aeven[p], A[p]); + pickCheckerboard(Odd, Aodd[p], A[p]); + } + pickCheckerboard(Even, AselfInvEven, AselfInv); + pickCheckerboard(Odd, AselfInvOdd, AselfInv); + } }; NAMESPACE_END(Grid); diff --git a/tests/solver/Test_coarse_even_odd.cc b/tests/solver/Test_coarse_even_odd.cc new file mode 100644 index 00000000..dfbab747 --- /dev/null +++ b/tests/solver/Test_coarse_even_odd.cc @@ -0,0 +1,475 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/solver/Test_coarse_even_odd.cc + + Copyright (C) 2015-2020 + + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace Grid; + +#ifndef NBASIS +#define NBASIS 40 +#endif + +// NOTE: The tests in this file are written in analogy to +// - tests/core/Test_wilson_even_odd.cc +// - tests/core/Test_wilson_clover.cc + +std::vector readFromCommandlineIvec(int* argc, + char*** argv, + std::string&& option, + const std::vector& defaultValue) { + std::string arg; + std::vector ret(defaultValue); + if(GridCmdOptionExists(*argv, *argv + *argc, option)) { + arg = GridCmdOptionPayload(*argv, *argv + *argc, option); + GridCmdOptionIntVector(arg, ret); + } + return ret; +} + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + ///////////////////////////////////////////////////////////////////////////// + // Read from command line // + ///////////////////////////////////////////////////////////////////////////// + + const int nbasis = NBASIS; static_assert((nbasis & 0x1) == 0, ""); + const int nb = nbasis/2; + Coordinate blockSize = readFromCommandlineIvec(&argc, &argv, "--blocksize", {2, 2, 2, 2}); + + std::cout << GridLogMessage << "Compiled with nbasis = " << nbasis << " -> nb = " << nb << std::endl; + + ///////////////////////////////////////////////////////////////////////////// + // General setup // + ///////////////////////////////////////////////////////////////////////////// + + Coordinate clatt = GridDefaultLatt(); + for(int d=0; dshow_decomposition(); + std::cout << GridLogMessage << "Grid_c:" << std::endl; Grid_c->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_f:" << std::endl; RBGrid_f->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_c:" << std::endl; RBGrid_c->show_decomposition(); + + GridParallelRNG pRNG_f(Grid_f); + GridParallelRNG pRNG_c(Grid_c); + + std::vector seeds({1, 2, 3, 4}); + + pRNG_f.SeedFixedIntegers(seeds); + pRNG_c.SeedFixedIntegers(seeds); + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Dirac Matrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + LatticeGaugeField Umu(Grid_f); SU3::HotConfiguration(pRNG_f, Umu); + + RealD checkTolerance = (getPrecision::value == 1) ? 1e-7 : 1e-15; + + RealD mass = -0.30; + RealD csw = 1.9192; + + WilsonCloverFermionR Dwc(Umu, *Grid_f, *RBGrid_f, mass, csw, csw); + MdagMLinearOperator MdagMOp_Dwc(Dwc); + + ///////////////////////////////////////////////////////////////////////////// + // Type definitions // + ///////////////////////////////////////////////////////////////////////////// + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseDiracMatrix; + typedef CoarseDiracMatrix::CoarseVector CoarseVector; + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Aggregation // + ///////////////////////////////////////////////////////////////////////////// + + Aggregates Aggs(Grid_c, Grid_f, 0); + { + LatticeFermion tmp(Aggs.subspace[0].Grid()); + for(int n = 0; n < nb; n++) { + gaussian(pRNG_f, Aggs.subspace[n]); + G5C(tmp, Aggs.subspace[n]); + axpby(Aggs.subspace[n + nb], 0.5, -0.5, Aggs.subspace[n], tmp); + axpby(Aggs.subspace[n], 0.5, 0.5, Aggs.subspace[n], tmp); + } + } + + ///////////////////////////////////////////////////////////////////////////// + // Setup of CoarsenedMatrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + const int hermitian = 0; + CoarseDiracMatrix Dc(*Grid_c, *RBGrid_c, hermitian); + Dc.CoarsenOperator(Grid_f, MdagMOp_Dwc, Aggs); + MdagMLinearOperator MdagMOp_Dc(Dc); + + ///////////////////////////////////////////////////////////////////////////// + // Setup vectors used in all tests // + ///////////////////////////////////////////////////////////////////////////// + + CoarseVector src(Grid_c); random(pRNG_c, src); + CoarseVector diff(Grid_c); diff = Zero(); + + ///////////////////////////////////////////////////////////////////////////// + // Start of tests // + ///////////////////////////////////////////////////////////////////////////// + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Dhop + Mdiag = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + Dc.Mdiag(src, phi); std::cout << GridLogMessage << "Applied Mdiag" << std::endl; + Dc.Dhop(src, chi, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + Dc.M(src, ref); std::cout << GridLogMessage << "Applied M" << std::endl; + + res = phi + chi; + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Munprec), norm2(Dhop + Mdiag), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev << " -> check " + << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe = Dhop" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector res_e(RBGrid_c); res_e = Zero(); + CoarseVector res_o(RBGrid_c); res_o = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + Dc.Meooe(src_e, res_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(src_o, res_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Dhop(src, ref, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + + setCheckerboard(res, res_o); + setCheckerboard(res, res_e); + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dhop), norm2(Meo + Moe), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag M^dag M v)| = 0" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.M(src, tmp); std::cout << GridLogMessage << "Applied M" << std::endl; + Dc.Mdag(tmp, phi); std::cout << GridLogMessage << "Applied Mdag" << std::endl; + + std::cout << GridLogMessage << "src = " << norm2(src) << " tmp = " << norm2(tmp) << " phi = " << norm2(phi) << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag M^dag M v), Im(v^dag M^dag M v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag Mooee^dag Mooee v)| = 0 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeDag(tmp, phi); std::cout << GridLogMessage << "Applied MooeeDag" << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag Mooee^dag Mooee v), Im(v^dag Mooee^dag Mooee v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeInv(tmp, phi); std::cout << GridLogMessage << "Applied MooeeInv" << std::endl; + + diff = src - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(src); + std::cout << GridLogMessage << "norm2(src), norm2(MooeeInv Mooee src), abs. deviation, rel. deviation: " + << norm2(src) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MeooeDagger is the dagger of Meooe by requiring" << std::endl; + std::cout << GridLogMessage << "= < phi | Meooe | chi > * = < chi | Meooe^dag| phi>" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + // clang-format off + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + // clang-format on + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + Dc.Meooe(chi_e, dchi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(chi_o, dchi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.MeooeDag(phi_e, dphi_o); std::cout << GridLogMessage << "Applied MeoDag" << std::endl; + Dc.MeooeDag(phi_o, dphi_e); std::cout << GridLogMessage << "Applied MoeDag" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogDebug << "norm dchi_e = " << norm2(dchi_e) << " norm dchi_o = " << norm2(dchi_o) << " norm dphi_e = " << norm2(dphi_e) + << " norm dphi_o = " << norm2(dphi_e) << std::endl; + + std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_o) " << phiDchi_e - conj(chiDphi_o) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_e) " << phiDchi_o - conj(chiDphi_e) << std::endl; + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.Mooee(chi_e, tmp_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.MooeeInv(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInv" << std::endl; + Dc.Mooee(chi_o, tmp_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.MooeeInv(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInv" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeInv Mee chi), abs. deviation, rel. deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeDag MooeeInvDag = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.MooeeDag(chi_e, tmp_e); std::cout << GridLogMessage << "Applied MeeDag" << std::endl; + Dc.MooeeInvDag(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInvDag" << std::endl; + Dc.MooeeDag(chi_o, tmp_o); std::cout << GridLogMessage << "Applied MooDag" << std::endl; + Dc.MooeeInvDag(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInvDag" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeDag MeeInvDag chi), abs. deviation, rel. deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe + Moo + Mee = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + + // M phi = (Mooee src_e + Meooe src_o , Mooee src_o + Meooe src_e) + + Dc.M(src, ref); // Reference result from the unpreconditioned operator + + // EO matrix + Dc.Mooee(src_e, chi_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.Mooee(src_o, chi_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.Meooe(src_o, phi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Meooe(src_e, phi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + + phi_o += chi_o; + phi_e += chi_e; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + std::cout << GridLogDebug << "norm phi_e = " << norm2(phi_e) << " norm phi_o = " << norm2(phi_o) << " norm phi = " << norm2(phi) << std::endl; + + diff = ref - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dunprec), norm2(Deoprec), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MpcDagMpc is hermitian" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + SchurDiagMooeeOperator HermOpEO(Dc); + + HermOpEO.MpcDagMpc(chi_e, dchi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_e" << std::endl; + HermOpEO.MpcDagMpc(chi_o, dchi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_o" << std::endl; + HermOpEO.MpcDagMpc(phi_e, dphi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_e" << std::endl; + HermOpEO.MpcDagMpc(phi_o, dphi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_o" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_e) " << phiDchi_e - conj(chiDphi_e) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_o) " << phiDchi_o - conj(chiDphi_o) << std::endl; + } + + Grid_finalize(); +} From 01652d8cfea1ad463b037b20aabfc4f3b0dc7d37 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 05:56:02 -0400 Subject: [PATCH 08/14] SlabAllocator --- Grid/threads/Accelerator.h | 13 +-- Grid/threads/SlabAllocator.cc | 161 ++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 11 deletions(-) create mode 100644 Grid/threads/SlabAllocator.cc diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 1a3dfdc2..89d45a17 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -153,18 +153,7 @@ inline void *acceleratorAllocShared(size_t bytes) } return ptr; }; -inline void *acceleratorAllocDevice(size_t bytes) -{ - void *ptr=NULL; - auto err = cudaMalloc((void **)&ptr,bytes); - if( err != cudaSuccess ) { - ptr = (void *) NULL; - printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); - } - return ptr; -}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; -inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline int acceleratorIsCommunicable(void *ptr) @@ -176,6 +165,8 @@ inline int acceleratorIsCommunicable(void *ptr) if(uvm) return 0; else return 1; } +void *acceleratorAllocDevice(size_t bytes); +void acceleratorFreeDevice(void* ptr); #endif diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc new file mode 100644 index 00000000..da863687 --- /dev/null +++ b/Grid/threads/SlabAllocator.cc @@ -0,0 +1,161 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./Grid/threads/SlabAllocator.cc + + Copyright (C) 2020 + +Author: Christoph Lehner + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +#include + +NAMESPACE_BEGIN(Grid); + +#ifdef GRID_CUDA + +#define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) +#define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) + +void *acceleratorAllocDeviceCUDA(size_t bytes) { + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + if( err != cudaSuccess ) { + ptr = (void *) NULL; + printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); + } + return ptr; +} + +void acceleratorFreeDeviceCUDA(void *ptr) { + cudaFree(ptr); +} + +struct grid_device_heap_slab_t { + void* Ptr; + size_t ElementSize; + size_t Elements; + std::unordered_set Allocated; + std::unordered_set Available; +}; + +std::unordered_map DeviceHeapPtrTable; +std::unordered_map > DeviceHeapSlabTable; + +void* SlabAllocateElement(grid_device_heap_slab_t* slab) { + assert(!slab->Available.empty()); + auto available = slab->Available.begin(); + auto slot = *available; + slab->Allocated.insert(slot); + slab->Available.erase(available); + + void* Ptr = (void*)((char*)slab->Ptr + slot * slab->ElementSize); + DeviceHeapPtrTable[Ptr] = slab; + + //std::cout << "Allocate element " << slot << " of slab " << slab << " of size " << slab->ElementSize << " with elements " << slab->Elements << + // " (allocated = " << slab->Allocated.size() << ", available = " << slab->Available.size() << ")" << std::endl; + + return Ptr; +} + +void SlabRemove(grid_device_heap_slab_t* slab) { + auto & t = DeviceHeapSlabTable[slab->ElementSize]; + assert(slab->Ptr); + DeviceHeapPtrTable.erase(slab->Ptr); + acceleratorFreeDeviceCUDA(slab->Ptr); + assert(t.count(slab) == 1); + t.erase(slab); + delete slab; + //std::cout << "Remove slab " << slab << std::endl; +} + +void SlabFreeElement(grid_device_heap_slab_t* slab, void* ElementPtr) { + size_t Offset = (size_t)ElementPtr - (size_t)slab->Ptr; + //std::cout << "SlabFreeElement offset " << Offset << std::endl; + assert(Offset < GRID_DEVICE_HEAP_SLAB_SIZE); + assert(Offset % slab->ElementSize == 0); + size_t slot = Offset / slab->ElementSize; + assert(slot >= 0); + assert(slab->Allocated.count(slot) == 1 && slab->Available.count(slot) == 0); + slab->Allocated.erase(slot); + slab->Available.insert(slot); + + //std::cout << "Free element " << slot << " of slab" << slab << std::endl; + + if (slab->Allocated.empty()) { + SlabRemove(slab); + } +} + +grid_device_heap_slab_t* SlabFind(size_t bytes) { + + grid_device_heap_slab_t* slab = 0; + std::unordered_set* slab_set = 0; + + decltype(DeviceHeapSlabTable.begin()) slabs = DeviceHeapSlabTable.find(bytes); + if (slabs == DeviceHeapSlabTable.end()) { + slab_set = &DeviceHeapSlabTable[bytes]; + } else { + slab_set = &slabs->second; + } + + for (auto& s : *slab_set) { + if (!s->Available.empty()) { + slab = &(*s); + break; + } + } + + if (!slab) { + slab = new grid_device_heap_slab_t; + slab_set->insert(slab); + slab->Ptr = acceleratorAllocDeviceCUDA(GRID_DEVICE_HEAP_SLAB_SIZE); + slab->ElementSize = bytes; + slab->Elements = GRID_DEVICE_HEAP_SLAB_SIZE / bytes; + for (size_t i=0;iElements;i++) + slab->Available.insert(i); + //std::cout << "New slab" << slab << std::endl; + } + + return slab; +} + +void *acceleratorAllocDevice(size_t bytes) { + if (bytes >= GRID_DEVICE_HEAP_SLAB_THRESHOLD) { + return acceleratorAllocDeviceCUDA(bytes); + } + + return SlabAllocateElement(SlabFind(bytes)); +} + +void acceleratorFreeDevice(void *ptr) { + auto p = DeviceHeapPtrTable.find(ptr); + if (p == DeviceHeapPtrTable.end()) { + acceleratorFreeDeviceCUDA(ptr); + } else { + SlabFreeElement(p->second,ptr); + } +} + +#endif + +NAMESPACE_END(Grid); From 32ff766dbd8416b45ae042463c8f65145b75806f Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:02:53 -0400 Subject: [PATCH 09/14] fix evict scheme, slab alloc --- Grid/allocator/MemoryManagerCache.cc | 6 ++++-- Grid/threads/SlabAllocator.cc | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 5dd7575e..7d4581d7 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -227,12 +227,13 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; + if (!AccCache.AccPtr) + EvictVictims(bytes); assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); @@ -361,12 +362,13 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; + if (!AccCache.AccPtr) + EvictVictims(bytes); assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc index da863687..5590f835 100644 --- a/Grid/threads/SlabAllocator.cc +++ b/Grid/threads/SlabAllocator.cc @@ -36,6 +36,9 @@ NAMESPACE_BEGIN(Grid); #define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) #define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) +size_t currentDeviceAlloc = 0; +std::unordered_map ptr_size; + void *acceleratorAllocDeviceCUDA(size_t bytes) { void *ptr=NULL; auto err = cudaMalloc((void **)&ptr,bytes); @@ -43,11 +46,16 @@ void *acceleratorAllocDeviceCUDA(size_t bytes) { ptr = (void *) NULL; printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); } + currentDeviceAlloc += bytes; + ptr_size[ptr] = bytes; + std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; return ptr; } void acceleratorFreeDeviceCUDA(void *ptr) { cudaFree(ptr); + currentDeviceAlloc -= ptr_size[ptr]; + std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; } struct grid_device_heap_slab_t { From d50a2164d73d2c01048cd1f25f765a720472b2c6 Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:06:06 -0400 Subject: [PATCH 10/14] remove slab allocator --- Grid/threads/Accelerator.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index 89d45a17..1a3dfdc2 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -153,7 +153,18 @@ inline void *acceleratorAllocShared(size_t bytes) } return ptr; }; +inline void *acceleratorAllocDevice(size_t bytes) +{ + void *ptr=NULL; + auto err = cudaMalloc((void **)&ptr,bytes); + if( err != cudaSuccess ) { + ptr = (void *) NULL; + printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); + } + return ptr; +}; inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);}; +inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline int acceleratorIsCommunicable(void *ptr) @@ -165,8 +176,6 @@ inline int acceleratorIsCommunicable(void *ptr) if(uvm) return 0; else return 1; } -void *acceleratorAllocDevice(size_t bytes); -void acceleratorFreeDevice(void* ptr); #endif From 5cffa05c7e2965216200e7fff3183fce3f15c8bb Mon Sep 17 00:00:00 2001 From: Christoph Lehner Date: Sun, 13 Sep 2020 14:06:25 -0400 Subject: [PATCH 11/14] remove slab allocator file --- Grid/threads/SlabAllocator.cc | 169 ---------------------------------- 1 file changed, 169 deletions(-) delete mode 100644 Grid/threads/SlabAllocator.cc diff --git a/Grid/threads/SlabAllocator.cc b/Grid/threads/SlabAllocator.cc deleted file mode 100644 index 5590f835..00000000 --- a/Grid/threads/SlabAllocator.cc +++ /dev/null @@ -1,169 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./Grid/threads/SlabAllocator.cc - - Copyright (C) 2020 - -Author: Christoph Lehner - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#include - -#include - -NAMESPACE_BEGIN(Grid); - -#ifdef GRID_CUDA - -#define GRID_DEVICE_HEAP_SLAB_THRESHOLD (1024*1024) -#define GRID_DEVICE_HEAP_SLAB_SIZE (2*1024*1024) - -size_t currentDeviceAlloc = 0; -std::unordered_map ptr_size; - -void *acceleratorAllocDeviceCUDA(size_t bytes) { - void *ptr=NULL; - auto err = cudaMalloc((void **)&ptr,bytes); - if( err != cudaSuccess ) { - ptr = (void *) NULL; - printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err)); - } - currentDeviceAlloc += bytes; - ptr_size[ptr] = bytes; - std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; - return ptr; -} - -void acceleratorFreeDeviceCUDA(void *ptr) { - cudaFree(ptr); - currentDeviceAlloc -= ptr_size[ptr]; - std::cout << "Current device alloc: " << currentDeviceAlloc << std::endl; -} - -struct grid_device_heap_slab_t { - void* Ptr; - size_t ElementSize; - size_t Elements; - std::unordered_set Allocated; - std::unordered_set Available; -}; - -std::unordered_map DeviceHeapPtrTable; -std::unordered_map > DeviceHeapSlabTable; - -void* SlabAllocateElement(grid_device_heap_slab_t* slab) { - assert(!slab->Available.empty()); - auto available = slab->Available.begin(); - auto slot = *available; - slab->Allocated.insert(slot); - slab->Available.erase(available); - - void* Ptr = (void*)((char*)slab->Ptr + slot * slab->ElementSize); - DeviceHeapPtrTable[Ptr] = slab; - - //std::cout << "Allocate element " << slot << " of slab " << slab << " of size " << slab->ElementSize << " with elements " << slab->Elements << - // " (allocated = " << slab->Allocated.size() << ", available = " << slab->Available.size() << ")" << std::endl; - - return Ptr; -} - -void SlabRemove(grid_device_heap_slab_t* slab) { - auto & t = DeviceHeapSlabTable[slab->ElementSize]; - assert(slab->Ptr); - DeviceHeapPtrTable.erase(slab->Ptr); - acceleratorFreeDeviceCUDA(slab->Ptr); - assert(t.count(slab) == 1); - t.erase(slab); - delete slab; - //std::cout << "Remove slab " << slab << std::endl; -} - -void SlabFreeElement(grid_device_heap_slab_t* slab, void* ElementPtr) { - size_t Offset = (size_t)ElementPtr - (size_t)slab->Ptr; - //std::cout << "SlabFreeElement offset " << Offset << std::endl; - assert(Offset < GRID_DEVICE_HEAP_SLAB_SIZE); - assert(Offset % slab->ElementSize == 0); - size_t slot = Offset / slab->ElementSize; - assert(slot >= 0); - assert(slab->Allocated.count(slot) == 1 && slab->Available.count(slot) == 0); - slab->Allocated.erase(slot); - slab->Available.insert(slot); - - //std::cout << "Free element " << slot << " of slab" << slab << std::endl; - - if (slab->Allocated.empty()) { - SlabRemove(slab); - } -} - -grid_device_heap_slab_t* SlabFind(size_t bytes) { - - grid_device_heap_slab_t* slab = 0; - std::unordered_set* slab_set = 0; - - decltype(DeviceHeapSlabTable.begin()) slabs = DeviceHeapSlabTable.find(bytes); - if (slabs == DeviceHeapSlabTable.end()) { - slab_set = &DeviceHeapSlabTable[bytes]; - } else { - slab_set = &slabs->second; - } - - for (auto& s : *slab_set) { - if (!s->Available.empty()) { - slab = &(*s); - break; - } - } - - if (!slab) { - slab = new grid_device_heap_slab_t; - slab_set->insert(slab); - slab->Ptr = acceleratorAllocDeviceCUDA(GRID_DEVICE_HEAP_SLAB_SIZE); - slab->ElementSize = bytes; - slab->Elements = GRID_DEVICE_HEAP_SLAB_SIZE / bytes; - for (size_t i=0;iElements;i++) - slab->Available.insert(i); - //std::cout << "New slab" << slab << std::endl; - } - - return slab; -} - -void *acceleratorAllocDevice(size_t bytes) { - if (bytes >= GRID_DEVICE_HEAP_SLAB_THRESHOLD) { - return acceleratorAllocDeviceCUDA(bytes); - } - - return SlabAllocateElement(SlabFind(bytes)); -} - -void acceleratorFreeDevice(void *ptr) { - auto p = DeviceHeapPtrTable.find(ptr); - if (p == DeviceHeapPtrTable.end()) { - acceleratorFreeDeviceCUDA(ptr); - } else { - SlabFreeElement(p->second,ptr); - } -} - -#endif - -NAMESPACE_END(Grid); From d4861a362ccaca6bbf300da450f46c10ea40ed29 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:38:49 +0000 Subject: [PATCH 12/14] Stencil use non-UVM memory for look up table on enable-shared=no --- Grid/stencil/Stencil.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1e198972..23fc8203 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -269,7 +269,7 @@ public: std::vector > > face_table ; Vector surface_list; - Vector _entries; // Resident in managed memory + stencilVector _entries; // Resident in managed memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; From 683a5e5bf556f97824fd242f76d0a81ab8dd7bd8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:39:51 +0000 Subject: [PATCH 13/14] Stencil use host vector for integera table on enable-shared=no and mirror it on device --- Grid/allocator/AlignedAllocator.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 4b357523..91622789 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -173,7 +173,8 @@ template using cshiftAllocator = devAllocator; template using cshiftAllocator = std::allocator; #endif -template using Vector = std::vector >; +template using Vector = std::vector >; +template using stencilVector = std::vector >; template using commVector = std::vector >; template using cshiftVector = std::vector >; From 97e264d0ff6562008be60ee4f3ba355198c7f247 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Mon, 23 Nov 2020 15:46:11 +0000 Subject: [PATCH 14/14] Christoph's changes --- Grid/allocator/MemoryManagerCache.cc | 36 +++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 5dd7575e..50076eba 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -1,11 +1,12 @@ #include - #ifndef GRID_UVM #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); +//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout); #define dprintf(...) + //////////////////////////////////////////////////////////// // For caching copies of data on device //////////////////////////////////////////////////////////// @@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); if(AccCache.state==AccDirty) { @@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - // dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) + EvictVictims(bytes); + assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { + dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + (uint64_t)AccCache.CpuPtr, + (uint64_t)CpuPtr, + (uint64_t)AccCache.bytes, + (uint64_t)bytes); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - // printf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - // printf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); } else { assert(0); } @@ -361,13 +368,14 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) + EvictVictims(bytes); + assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error