diff --git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h
index 8ea219fb..4bd1edd0 100644
--- a/Grid/DisableWarnings.h
+++ b/Grid/DisableWarnings.h
@@ -37,7 +37,9 @@ directory
 #endif
 
 //disables and intel compiler specific warning (in json.hpp)
+#ifdef __ICC
 #pragma warning disable 488
+#endif
 
 #ifdef __NVCC__
 //disables nvcc specific warning in json.hpp
diff --git a/Grid/GridStd.h b/Grid/GridStd.h
index ecb561ea..28f6bc46 100644
--- a/Grid/GridStd.h
+++ b/Grid/GridStd.h
@@ -28,4 +28,7 @@
 ///////////////////
 #include "Config.h"
 
+#ifdef TOFU
+#undef GRID_COMMS_THREADS
+#endif
 #endif /* GRID_STD_H */
diff --git a/Grid/Makefile.am b/Grid/Makefile.am
index f1fa462e..ded6d146 100644
--- a/Grid/Makefile.am
+++ b/Grid/Makefile.am
@@ -21,6 +21,7 @@ if BUILD_HDF5
 extra_headers+=serialisation/Hdf5Type.h
 endif
 
+
 all: version-cache Version.h
 
 version-cache:
@@ -53,6 +54,17 @@ Version.h: version-cache
 include Make.inc
 include Eigen.inc
 
+extra_sources+=$(ZWILS_FERMION_FILES)
+extra_sources+=$(WILS_FERMION_FILES)
+extra_sources+=$(STAG_FERMION_FILES)
+if BUILD_GPARITY
+  extra_sources+=$(GP_FERMION_FILES)
+endif
+if BUILD_FERMION_REPS
+  extra_sources+=$(ADJ_FERMION_FILES)
+  extra_sources+=$(TWOIND_FERMION_FILES)
+endif
+
 lib_LIBRARIES = libGrid.a
 
 CCFILES += $(extra_sources)
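The GridStd.h hunk above ties into the communicator change further down (Communicator_mpi3.cc), which now keys off GRID_COMMS_THREADS instead of testing TOFU directly. A condensed sketch of the preprocessor flow, assuming only what the two hunks show; the function wrapper is illustrative, not Grid code:

```cpp
// Condensed sketch (not part of the patch): how the GridStd.h hunk above feeds
// the Communicator_mpi3.cc hunk further down. Defining TOFU (Fugaku) removes
// GRID_COMMS_THREADS again, and the MPI init path then pins the comms-thread
// count to 1, since comms/compute overlap gives wrong results there.
#include "Config.h"        // may define TOFU and GRID_COMMS_THREADS

#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif

int nCommThreads = -1;     // set from the command line in the threaded case

void InitCommsPolicy(void) {
#ifndef GRID_COMMS_THREADS
  nCommThreads = 1;        // single comms thread, overlap disabled
#endif
}
```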
diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h
index 8d184aea..66b9c169 100644
--- a/Grid/algorithms/CoarsenedMatrix.h
+++ b/Grid/algorithms/CoarsenedMatrix.h
@@ -31,6 +31,7 @@ Author: paboyle
 #ifndef  GRID_ALGORITHM_COARSENED_MATRIX_H
 #define  GRID_ALGORITHM_COARSENED_MATRIX_H
 
+#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
 
 NAMESPACE_BEGIN(Grid);
 
@@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
 class Geometry {
 public:
   int npoint;
+  int base;
   std::vector<int> directions   ;
   std::vector<int> displacements;
+  std::vector<int> points_dagger;
 
   Geometry(int _d)  {
 
-    int base = (_d==5) ? 1:0;
+    base = (_d==5) ? 1:0;
 
     // make coarse grid stencil for 4d , not 5d
     if ( _d==5 ) _d=4;
@@ -72,16 +75,51 @@ public:
     npoint = 2*_d+1;
     directions.resize(npoint);
     displacements.resize(npoint);
+    points_dagger.resize(npoint);
     for(int d=0;d<_d;d++){
       directions[d   ] = d+base;
       directions[d+_d] = d+base;
       displacements[d  ] = +1;
       displacements[d+_d]= -1;
+      points_dagger[d   ] = d+_d;
+      points_dagger[d+_d] = d;
     }
     directions   [2*_d]=0;
     displacements[2*_d]=0;
+    points_dagger[2*_d]=2*_d;
   }
 
+  int point(int dir, int disp) {
+    assert(disp == -1 || disp == 0 || disp == 1);
+    assert(base+0 <= dir && dir < base+4);
+
+    // directions faster index = new indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  1  2  3  0  1  2  3  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  2  3  4  1  2  3  4  0
+    // disp +1 +1 +1 +1 -1 -1 -1 -1  0
+
+    // displacements faster index = old indexing
+    // 4d (base = 0):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   0  0  1  1  2  2  3  3  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+    // 5d (base = 1):
+    // point 0  1  2  3  4  5  6  7  8
+    // dir   1  1  2  2  3  3  4  4  0
+    // disp +1 -1 +1 -1 +1 -1 +1 -1  0
+
+    if(dir == 0 and disp == 0)
+      return 8;
+    else // New indexing
+      return (1 - disp) / 2 * 4 + dir - base;
+    // else // Old indexing
+    //   return (4 * (dir - base) + 1 - disp) / 2;
+  }
 };
 
 template<class Fobj,class CComplex,int nbasis>
@@ -258,7 +296,7 @@ public:
 // Fine Object == (per site) type of fine field
 //     nbasis == number of deflation vectors
 template<class Fobj,class CComplex,int nbasis>
-class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
+class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > >  {
 public:
 
   typedef iVector<CComplex,nbasis > siteVector;
@@ -268,33 +306,59 @@ public:
   typedef iMatrix<CComplex,nbasis > Cobj;
   typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
   typedef Lattice<Fobj >        FineField;
+  typedef CoarseVector FermionField;
+
+  // enrich interface, use default implementation as in FermionOperator ///////
+  void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
+  void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
+  void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
+  void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
+  void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
+  void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
 
   ////////////////////
   // Data members
   ////////////////////
   Geometry geom;
   GridBase *       _grid;
+  GridBase*        _cbgrid;
   int hermitian;
 
   CartesianStencil<siteVector,siteVector,int> Stencil;
+  CartesianStencil<siteVector,siteVector,int> StencilEven;
+  CartesianStencil<siteVector,siteVector,int> StencilOdd;
 
   std::vector<CoarseMatrix> A;
-
+  std::vector<CoarseMatrix> Aeven;
+  std::vector<CoarseMatrix> Aodd;
+
+  CoarseMatrix AselfInv;
+  CoarseMatrix AselfInvEven;
+  CoarseMatrix AselfInvOdd;
+
+  Vector<RealD> dag_factor;
+
   ///////////////////////
   // Interface
   ///////////////////////
   GridBase * Grid(void)         { return _grid; };   // this is all the linalg routines need to know
+  GridBase * RedBlackGrid()     { return _cbgrid; };
+
+  int ConstEE() { return 0; }
 
   void M (const CoarseVector &in, CoarseVector &out)
   {
     conformable(_grid,in.Grid());
     conformable(in.Grid(),out.Grid());
+    out.Checkerboard() = in.Checkerboard();
 
     SimpleCompressor<siteVector> compressor;
 
     Stencil.HaloExchange(in,compressor);
     autoView( in_v , in, AcceleratorRead);
     autoView( out_v , out, AcceleratorWrite);
+    autoView( Stencil_v  , Stencil, AcceleratorRead);
+    auto& geom_v = geom;
     typedef LatticeView<Cobj> Aview;
 
     Vector<Aview> AcceleratorViewContainer;
@@ -316,14 +380,14 @@
public: int ptype; StencilEntry *SE; - for(int point=0;point_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -344,12 +408,72 @@ public: return M(in,out); } else { // corresponds to Galerkin coarsening - CoarseVector tmp(Grid()); - G5C(tmp, in); - M(tmp, out); - G5C(out, out); + return MdagNonHermitian(in, out); } }; + + void MdagNonHermitian(const CoarseVector &in, CoarseVector &out) + { + conformable(_grid,in.Grid()); + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + + for(int p=0;poSites(); + + Vector points(geom.npoint, 0); + for(int p=0; poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; @@ -359,6 +483,7 @@ public: { conformable(_grid,in.Grid()); conformable(_grid,out.Grid()); + out.Checkerboard() = in.Checkerboard(); typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -367,6 +492,7 @@ public: autoView( out_v , out, AcceleratorWrite); autoView( in_v , in, AcceleratorRead); + autoView( Stencil_v , Stencil, AcceleratorRead); const int Nsimd = CComplex::Nsimd(); typedef decltype(coalescedRead(in_v[0])) calcVector; @@ -380,12 +506,12 @@ public: int ptype; StencilEntry *SE; - SE=Stencil.GetEntry(ptype,point,ss); + SE=Stencil_v.GetEntry(ptype,point,ss); if(SE->_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -413,34 +539,7 @@ public: this->MdirComms(in); - int ndim = in.Grid()->Nd(); - - ////////////// - // 4D action like wilson - // 0+ => 0 - // 0- => 1 - // 1+ => 2 - // 1- => 3 - // etc.. - ////////////// - // 5D action like DWF - // 1+ => 0 - // 1- => 1 - // 2+ => 2 - // 2- => 3 - // etc.. 
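For reference, the displacement-fastest map implemented by the lambda removed just below and the new direction-fastest Geometry::point() above cover the same domain and range; only the ordering of the eight hopping points differs. A minimal standalone cross-check, a sketch assuming plain C++ with names local to this example:

```cpp
// Standalone cross-check (not part of the patch) of the new direction-fastest
// Geometry::point() against the displacement-fastest lambda removed below
// (base = 1 for a 5d action, 0 for 4d, matching the removed ndim test).
#include <cassert>
#include <initializer_list>

static int point_new(int dir, int disp, int base) { // as in Geometry::point()
  if (dir == 0 && disp == 0) return 8;
  return (1 - disp) / 2 * 4 + dir - base;
}

static int point_old(int dir, int disp, int base) { // the removed lambda
  if (dir == 0 && disp == 0) return 8;
  return (4 * (dir - base) + 1 - disp) / 2;
}

int main() {
  for (int base : {0, 1}) {                     // 4d and 5d actions
    bool seen_new[8] = {}, seen_old[8] = {};
    for (int dir = base; dir < base + 4; ++dir)
      for (int disp : {+1, -1}) {
        int pn = point_new(dir, disp, base);
        int po = point_old(dir, disp, base);
        assert(0 <= pn && pn < 8 && 0 <= po && po < 8);
        seen_new[pn] = seen_old[po] = true;     // each map hits each point once
      }
    for (int p = 0; p < 8; ++p) assert(seen_new[p] && seen_old[p]);
    // both maps reserve point 8 for the self-coupling (dir==0, disp==0)
    assert(point_new(0, 0, base) == 8 && point_old(0, 0, base) == 8);
  }
  return 0;
}
```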
- auto point = [dir, disp, ndim](){ - if(dir == 0 and disp == 0) - return 8; - else if ( ndim==4 ) { - return (4 * dir + 1 - disp) / 2; - } else { - return (4 * (dir-1) + 1 - disp) / 2; - } - }(); - - MdirCalc(in,out,point); - + MdirCalc(in,out,geom.point(dir,disp)); }; void Mdiag(const CoarseVector &in, CoarseVector &out) @@ -449,17 +548,269 @@ public: MdirCalc(in, out, point); // No comms }; + void Mooee(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseNo); + } + + void MooeeInv(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseYes); + } + + void MooeeDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseNo); + } + + void MooeeInvDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseYes); + } + + void Meooe(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } + } + + void MeooeDag(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } + } + + void Dhop(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, A, in, out, dag); + } + + void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, Aodd, in, out, dag); + } + + void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + out.Checkerboard() = Even; + + DhopInternal(StencilOdd, Aeven, in, out, dag); + } + + void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) { + out.Checkerboard() = in.Checkerboard(); + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + + CoarseMatrix *Aself = nullptr; + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1]; + DselfInternal(StencilOdd, *Aself, in, out, dag); + } else { + Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1]; + DselfInternal(StencilEven, *Aself, in, out, dag); + } + } else { + Aself = (inv) ? 
&AselfInv : &A[geom.npoint-1]; + DselfInternal(Stencil, *Aself, in, out, dag); + } + assert(Aself != nullptr); + } + + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, + const CoarseVector &in, CoarseVector &out, int dag) { + int point = geom.npoint-1; + autoView( out_v, out, AcceleratorWrite); + autoView( in_v, in, AcceleratorRead); + autoView( st_v, st, AcceleratorRead); + autoView( a_v, a, AcceleratorRead); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + RealD* dag_factor_p = &dag_factor[0]; + + if(dag) { + accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb &st, std::vector &a, + const CoarseVector &in, CoarseVector &out, int dag) { + SimpleCompressor compressor; + + st.HaloExchange(in,compressor); + autoView( in_v, in, AcceleratorRead); + autoView( out_v, out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + typedef LatticeView Aview; + + // determine in what order we need the points + int npoint = geom.npoint-1; + Vector points(npoint, 0); + for(int p=0; p AcceleratorViewContainer; + for(int p=0;poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb > &linop, Aggregation & Subspace) { @@ -608,6 +959,9 @@ public: std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); + + typedef typename Cobj::scalar_object scalar_object; + + autoView(Aself_v, A[geom.npoint-1], CpuRead); + autoView(AselfInv_v, AselfInv, CpuWrite); + thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke + Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + + scalar_object selfLink = Zero(); + scalar_object selfLinkInv = Zero(); + + Coordinate lcoor; + + Grid()->LocalIndexToLocalCoor(site, lcoor); + peekLocalSite(selfLink, Aself_v, lcoor); + + for (int i = 0; i < nbasis; ++i) + for (int j = 0; j < nbasis; ++j) + selfLinkEigen(i, j) = static_cast(TensorRemove(selfLink(i, j))); + + selfLinkInvEigen = 
selfLinkEigen.inverse(); + + for(int i = 0; i < nbasis; ++i) + for(int j = 0; j < nbasis; ++j) + selfLinkInv(i, j) = selfLinkInvEigen(i, j); + + pokeLocalSite(selfLinkInv, AselfInv_v, lcoor); + }); + } + + void FillHalfCbs() { + std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl; + for(int p = 0; p < geom.npoint; ++p) { + pickCheckerboard(Even, Aeven[p], A[p]); + pickCheckerboard(Odd, Aodd[p], A[p]); + } + pickCheckerboard(Even, AselfInvEven, AselfInv); + pickCheckerboard(Odd, AselfInvOdd, AselfInv); + } }; NAMESPACE_END(Grid); diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc deleted file mode 100644 index 0d1707d9..00000000 --- a/Grid/allocator/AlignedAllocator.cc +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include - -NAMESPACE_BEGIN(Grid); - -MemoryStats *MemoryProfiler::stats = nullptr; -bool MemoryProfiler::debug = false; - -void check_huge_pages(void *Buf,uint64_t BYTES) -{ -#ifdef __linux__ - int fd = open("/proc/self/pagemap", O_RDONLY); - assert(fd >= 0); - const int page_size = 4096; - uint64_t virt_pfn = (uint64_t)Buf / page_size; - off_t offset = sizeof(uint64_t) * virt_pfn; - uint64_t npages = (BYTES + page_size-1) / page_size; - uint64_t pagedata[npages]; - uint64_t ret = lseek(fd, offset, SEEK_SET); - assert(ret == offset); - ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); - assert(ret == sizeof(uint64_t) * npages); - int nhugepages = npages / 512; - int n4ktotal, nnothuge; - n4ktotal = 0; - nnothuge = 0; - for (int i = 0; i < nhugepages; ++i) { - uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; - for (int j = 0; j < 512; ++j) { - uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; - ++n4ktotal; - if (pageaddr != baseaddr + j * page_size) - ++nnothuge; - } - } - int rank = CartesianCommunicator::RankWorld(); - printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); -#endif -} - -std::string sizeString(const size_t bytes) -{ - constexpr unsigned int bufSize = 256; - const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; - char buf[256]; - size_t s = 0; - double count = bytes; - - while (count >= 1024 && s < 7) - { - s++; - count /= 1024; - } - if (count - floor(count) == 0.0) - { - snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); - } - else - { - snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); - } - - return std::string(buf); -} - -NAMESPACE_END(Grid); - diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 249732fb..91622789 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -165,9 +165,18 @@ template inline bool operator!=(const devAllocator<_Tp>&, const d //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -//template using commAllocator = devAllocator; -template using Vector = std::vector >; +#ifdef ACCELERATOR_CSHIFT +// Cshift on device +template using cshiftAllocator = devAllocator; +#else +// Cshift on host +template using cshiftAllocator = std::allocator; +#endif + +template using Vector = std::vector >; +template using stencilVector = std::vector >; template using commVector = std::vector >; +template using cshiftVector = std::vector >; NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index aac13aee..25c5b5f5 100644 --- a/Grid/allocator/MemoryManager.h 
+++ b/Grid/allocator/MemoryManager.h @@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? -#define ALLOCATION_CACHE -#define GRID_ALLOC_ALIGN (2*1024*1024) #define GRID_ALLOC_SMALL_LIMIT (4096) /*Pinning pages is costly*/ diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 5dd7575e..275ed5e0 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -1,11 +1,12 @@ #include - #ifndef GRID_UVM #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); +//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout); #define dprintf(...) + //////////////////////////////////////////////////////////// // For caching copies of data on device //////////////////////////////////////////////////////////// @@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); if(AccCache.state==AccDirty) { @@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - // dprintf("MemoryManager: Clone %llx <- 
%llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) { + EvictVictims(bytes); + } assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { + dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + (uint64_t)AccCache.CpuPtr, + (uint64_t)CpuPtr, + (uint64_t)AccCache.bytes, + (uint64_t)bytes); assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.bytes ==bytes); } @@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - // printf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - // printf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); } else { assert(0); } @@ -361,13 +368,16 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + + if (!AccCache.AccPtr) { + EvictVictims(bytes); + } + assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 537f7c32..3f165007 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ b/Grid/allocator/MemoryManagerShared.cc @@ -1,7 +1,6 @@ #include #ifdef GRID_UVM -#warning "Grid is assuming unified virtual memory address space" NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////// // View management is 1:1 address space mapping diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 83f71233..c6543851 100644 
--- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { -#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori +#ifndef GRID_COMMS_THREADS nCommThreads=1; // wrong results here too // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs @@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector ranks(size); for(int r=0;r #include NAMESPACE_BEGIN(Grid); +#define header "SharedMemoryNone: " /*Construct from an MPI communicator*/ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) @@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended, use anonymous mmap //////////////////////////////////////////////////////////////////////////////////////////// +#if 1 +void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) +{ + std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "< > Cshift_table; // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask, int off=0) +Gather_plane_simple (const Lattice &rhs,cshiftVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen } } { - autoView(rhs_v , rhs, AcceleratorRead); auto buffer_p = & buffer[0]; auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT + autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for(i,ent,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); accelerator_for2d(n,e1,b,e2,1,{ int o = n*n1; @@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice &rhs, vobj temp =rhs_v[so+o+b]; extract(temp,pointers,offset); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + int o = n*n1; + int offset = b+n*e2; + + vobj temp =rhs_v[so+o+b]; + extract(temp,pointers,offset); + }); +#endif } else { - autoView(rhs_v , rhs, AcceleratorRead); - Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < &rhs, extract(temp,pointers,offset); } }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + + Coordinate coor; + + int o=n*n1; + int oindex = o+b; + + int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); + + int ocb=1<(temp,pointers,offset); + } + }); +#endif } } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,commVector &buffer, int dimension,int plane,int cbmask) +template void Scatter_plane_simple (Lattice &rhs,cshiftVector 
&buffer, int dimension,int plane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -182,12 +220,19 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - autoView( rhs_v , rhs, AcceleratorWrite); int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; +#ifdef ACCELERATOR_CSHIFT + autoView( rhs_v , rhs, AcceleratorWrite); accelerator_for2d(n,e1,b,e2,1,{ int o = n*_slice_stride; int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); +#else + autoView( rhs_v , rhs, CpuWrite); + thread_for2d(n,e1,b,e2,{ + int o = n*_slice_stride; + int offset = b+n*_slice_block; + merge(rhs_v[so+o+b],pointers,offset); + }); +#endif } else { // Case of SIMD split AND checker dim cannot currently be hit, except in @@ -280,12 +334,20 @@ template void Copy_plane(Lattice& lhs,const Lattice &rhs } { + auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); - auto table = &Cshift_table[0]; accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + autoView(lhs_v , lhs, CpuWrite); + thread_for(i,ent,{ + lhs_v[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -324,12 +386,20 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice void Cshift_comms_simd(Lattice& ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { typedef typename vobj::vector_type vector_type; @@ -121,9 +122,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - commVector send_buf(buffer_size); - commVector recv_buf(buffer_size); - + cshiftVector send_buf(buffer_size); + cshiftVector recv_buf(buffer_size); + int cb= (cbmask==0x2)? 
Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -138,7 +139,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r } else { - int words = send_buf.size(); + int words = buffer_size; if (cbmask != 0x3) words=words>>1; int bytes = words * sizeof(vobj); @@ -150,12 +151,14 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + grid->Barrier(); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); @@ -195,8 +198,15 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd,commVector(buffer_size) ); - std::vector > recv_buf_extract(Nsimd,commVector(buffer_size) ); + std::vector > send_buf_extract(Nsimd); + std::vector > recv_buf_extract(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + + for(int s=0;s void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], + grid->Barrier(); + + send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; + recv_buf_extract_mpi = &recv_buf_extract[i][0]; + grid->SendToRecvFrom((void *)send_buf_extract_mpi, xmit_to_rank, - (void *)&recv_buf_extract[i][0], + (void *)recv_buf_extract_mpi, recv_from_rank, bytes); + + grid->Barrier(); + + rpointers[i] = &recv_buf_extract[i][0]; + } else { + rpointers[i] = &send_buf_extract[nbr_lane][0]; + } + + } + Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); + } + +} +#else +template void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=rhs.Grid(); + Lattice temp(rhs.Grid()); + + int fd = rhs.Grid()->_fdimensions[dimension]; + int rd = rhs.Grid()->_rdimensions[dimension]; + int pd = rhs.Grid()->_processors[dimension]; + int simd_layout = rhs.Grid()->_simd_layout[dimension]; + int comm_dim = rhs.Grid()->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; + cshiftVector send_buf_v(buffer_size); + cshiftVector recv_buf_v(buffer_size); + vobj *send_buf; + vobj *recv_buf; + { + grid->ShmBufferFreeAll(); + size_t bytes = buffer_size*sizeof(vobj); + send_buf=(vobj *)grid->ShmBufferMalloc(bytes); + recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); + } + + int cb= (cbmask==0x2)? 
Odd : Even; + int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + + // int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); + + grid->Barrier(); + + Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + } + } +} + +template void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + GridBase *grid=rhs.Grid(); + const int Nsimd = grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object scalar_object; + typedef typename vobj::scalar_type scalar_type; + + int fd = grid->_fdimensions[dimension]; + int rd = grid->_rdimensions[dimension]; + int ld = grid->_ldimensions[dimension]; + int pd = grid->_processors[dimension]; + int simd_layout = grid->_simd_layout[dimension]; + int comm_dim = grid->_processors[dimension] >1 ; + + //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // int words = sizeof(vobj)/sizeof(vector_type); + + std::vector > send_buf_extract(Nsimd); + std::vector > recv_buf_extract(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + { + size_t bytes = sizeof(scalar_object)*buffer_size; + grid->ShmBufferFreeAll(); + send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + } + for(int s=0;s pointers(Nsimd); // + ExtractPointerArray rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + int cb = (cbmask==0x2)? Odd : Even; + int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer + int nbr_ox = (nbr_coor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + int recv_from_rank; + int xmit_to_rank; + + if (nbr_ic) nbr_lane|=inner_bit; + + assert (sx == nbr_ox); + + if(nbr_proc){ + grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); + grid->SendToRecvFrom((void *)send_buf_extract_mpi, + xmit_to_rank, + (void *)recv_buf_extract_mpi, + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); + grid->Barrier(); rpointers[i] = &recv_buf_extract[i][0]; } else { @@ -258,7 +461,7 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice Bt(Nm * max_threads); thread_region @@ -161,11 +161,13 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in double * Qt_j = & Qt_jv[0]; for(int k=0;koSites(),vobj::Nsimd(),{ - auto B=coalescedRead(zz); + vobj zzz=Zero(); + auto B=coalescedRead(zzz); for(int k=k0; k & in) { convertType(out,in._internal); } +template::value, T1>::type* = nullptr> +accelerator_inline void convertType(T1 & out, const iScalar & in) { + convertType(out,in._internal); +} + template accelerator_inline void convertType(iScalar & out, const T2 & in) { convertType(out._internal,in); diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index 0b726db9..9dca403b 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -97,42 +97,30 @@ public: Coordinate icoor; #ifdef GRID_SIMT - _Spinor tmp; - const int Nsimd =SiteDoubledGaugeField::Nsimd(); int s = acceleratorSIMTlane(Nsimd); St.iCoorFromIindex(icoor,s); int mmu = mu % Nd; - if ( SE->_around_the_world && St.parameters.twists[mmu] ) { - - int permute_lane = (sl==1) - || ((distance== 1)&&(icoor[direction]==1)) - || ((distance==-1)&&(icoor[direction]==0)); - if ( permute_lane ) { - tmp(0) = chi(1); - tmp(1) = chi(0); - } else { - tmp(0) = chi(0); - tmp(1) = chi(1); - } + auto UU0=coalescedRead(U(0)(mu)); + auto UU1=coalescedRead(U(1)(mu)); + + //Decide whether we do a G-parity flavor twist + //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir + //It also assumes (but does not check) that abs(distance) == 1 + int permute_lane = (sl==1) + || ((distance== 1)&&(icoor[direction]==1)) + || ((distance==-1)&&(icoor[direction]==0)); - auto UU0=coalescedRead(U(0)(mu)); - auto UU1=coalescedRead(U(1)(mu)); + permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world - mult(&phi(0),&UU0,&tmp(0)); - mult(&phi(1),&UU1,&tmp(1)); + //Apply the links + int f_upper = permute_lane ? 
1 : 0; + int f_lower = !f_upper; - } else { - - auto UU0=coalescedRead(U(0)(mu)); - auto UU1=coalescedRead(U(1)(mu)); - - mult(&phi(0),&UU0,&chi(0)); - mult(&phi(1),&UU1,&chi(1)); - - } + mult(&phi(0),&UU0,&chi(f_upper)); + mult(&phi(1),&UU1,&chi(f_lower)); #else typedef _Spinor vobj; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index b3fbe096..f11e9c44 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -642,7 +642,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, Current curr_type, unsigned int mu) { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -826,7 +826,7 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index e721c20d..3032a80c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -92,20 +92,16 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) int lvol = _Umu.Grid()->lSites(); int DimRep = Impl::Dimension; - Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - - Coordinate lcoor; - typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); - { autoView(CTv,CloverTerm,CpuRead); autoView(CTIv,CloverTermInv,CpuWrite); - for (int site = 0; site < lvol; site++) { + thread_for(site, lvol, { + Coordinate lcoor; grid->LocalIndexToLocalCoor(site, lcoor); - EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); peekLocalSite(Qx, CTv, lcoor); - Qxinv = Zero(); //if (csw!=0){ for (int j = 0; j < Ns; j++) for (int k = 0; k < Ns; k++) @@ -126,7 +122,7 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; // } pokeLocalSite(Qxinv, CTIv, lcoor); - } + }); } // Separate the even and odd parts diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index 9b7d5a60..55a20eca 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -154,6 +154,10 @@ public: return Hsum.real(); } + static inline void Project(Field &U) { + ProjectSUn(U); + } + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { SU::HotConfiguration(pRNG, U); } diff --git a/Grid/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h index 14675b11..403ea573 100644 --- a/Grid/qcd/action/scalar/ScalarImpl.h +++ b/Grid/qcd/action/scalar/ScalarImpl.h @@ -54,6 +54,10 @@ public: static inline void 
ColdConfiguration(GridParallelRNG &pRNG, Field &U) { U = 1.0; } + + static inline void Project(Field &U) { + return; + } static void MomentumSpacePropagator(Field &out, RealD m) { @@ -234,6 +238,10 @@ public: #endif //USE_FFT_ACCELERATION } + static inline void Project(Field &U) { + return; + } + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U); } diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index c2443dd0..98e8175a 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -159,6 +159,13 @@ private: Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U, Resources.GetSerialRNG(), Resources.GetParallelRNG()); + } else { + // others + std::cout << GridLogError << "Unrecognized StartingType\n"; + std::cout + << GridLogError + << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + exit(1); } Smearing.set_Field(U); diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h index 0f933204..f168b69a 100644 --- a/Grid/qcd/hmc/HMC.h +++ b/Grid/qcd/hmc/HMC.h @@ -95,7 +95,7 @@ private: typedef typename IntegratorType::Field Field; typedef std::vector< HmcObservable * > ObsListType; - + //pass these from the resource manager GridSerialRNG &sRNG; GridParallelRNG &pRNG; diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index d5475704..70055754 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -313,6 +313,8 @@ public: std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } + FieldImplementation::Project(U); + // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); diff --git a/Grid/qcd/utils/BaryonUtils.h b/Grid/qcd/utils/BaryonUtils.h index b268b684..15516b56 100644 --- a/Grid/qcd/utils/BaryonUtils.h +++ b/Grid/qcd/utils/BaryonUtils.h @@ -51,7 +51,7 @@ public: private: template - static void baryon_site(const mobj &D1, + static void BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -61,8 +61,18 @@ public: const int parity, const bool * wick_contractions, robj &result); + template + static void BaryonSiteMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool * wick_contractions, + robj &result); public: - static void Wick_Contractions(std::string qi, + static void WickContractions(std::string qi, std::string qf, bool* wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, @@ -75,8 +85,17 @@ public: const bool* wick_contractions, const int parity, ComplexField &baryon_corr); + static void ContractBaryonsMatrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + SpinMatrixField &baryon_corr); template - static void ContractBaryons_Sliced(const mobj &D1, + static void ContractBaryonsSliced(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -87,9 +106,20 @@ public: const int parity, const int nt, robj &result); + template + static void ContractBaryonsSlicedMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + 
const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int nt, + robj &result); private: template - static void Baryon_Gamma_3pt_Group1_Site( + static void BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, @@ -101,7 +131,7 @@ public: robj &result); template - static void Baryon_Gamma_3pt_Group2_Site( + static void BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, @@ -113,7 +143,7 @@ public: robj &result); template - static void Baryon_Gamma_3pt_Group3_Site( + static void BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, @@ -125,7 +155,7 @@ public: robj &result); public: template - static void Baryon_Gamma_3pt( + static void BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, @@ -138,7 +168,7 @@ public: SpinMatrixField &stn_corr); private: template - static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, + static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -147,7 +177,7 @@ public: const Gamma GammaB_nucl, robj &result); template - static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, + static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -159,7 +189,7 @@ public: template - static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, + static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -168,7 +198,7 @@ public: const Gamma GammaB_nucl, robj &result); template - static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, + static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -179,7 +209,7 @@ public: robj &result); public: template - static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, + static void SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, @@ -189,7 +219,7 @@ public: const std::string op, SpinMatrixField &stn_corr); template - static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, + static void SigmaToNucleonNonEye(const PropagatorField &qq_ti, const PropagatorField &qq_tf, const mobj &Du_spec, const PropagatorField &qd_tf, @@ -217,7 +247,7 @@ const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; //This is the old version template template -void BaryonUtils::baryon_site(const mobj &D1, +void BaryonUtils::BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_i, @@ -329,12 +359,132 @@ void BaryonUtils::baryon_site(const mobj &D1, }} } +//New version without parity projection or trace +template +template +void BaryonUtils::BaryonSiteMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, + const bool * wick_contraction, + robj &result) +{ + + auto D1_GAi = D1 * GammaA_i; + auto GAf_D1_GAi = GammaA_f * D1_GAi; + auto GBf_D1_GAi = GammaB_f * D1_GAi; + + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; + + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = epsilon[ie_f][0]; //a + int b_f = epsilon[ie_f][1]; //b + int c_f = epsilon[ie_f][2]; //c + for 
(int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = epsilon[ie_i][0]; //a' + int b_i = epsilon[ie_i][1]; //b' + int c_i = epsilon[ie_i][2]; //c' + + Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + //This is the \delta_{456}^{123} part + if (wick_contraction[0]){ + for (int rho_i=0; rho_i -void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { +void BaryonUtils::WickContractions(std::string qi, std::string qf, bool* wick_contractions) { const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; for (int ie=0; ie < 6 ; ie++) { wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 @@ -364,11 +514,6 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); @@ -397,13 +542,62 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, auto D2 = v2[ss]; auto D3 = v3[ss]; vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); + BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); vbaryon_corr[ss] = result; } );//end loop over lattice sites t += usecond(); - std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; + std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; +} + +template +void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + SpinMatrixField &baryon_corr) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + GridBase *grid = q1_left.Grid(); + + autoView(vbaryon_corr, baryon_corr,CpuWrite); + autoView( v1 , q1_left, CpuRead); + autoView( v2 , q2_left, CpuRead); + autoView( v3 , q3_left, CpuRead); + + // Real bytes =0.; + // bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); + // for (int ie=0; ie < 6 ; ie++){ + // if(ie==0 or ie==3){ + // bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; + // } + // else{ + // bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; + // } + // } + // Real t=0.; + // t =-usecond(); + + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto D1 = v1[ss]; + auto D2 = v2[ss]; + auto D3 = v3[ss]; + sobj result=Zero(); + BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); + vbaryon_corr[ss] = result; + } );//end loop over lattice sites + + // t += usecond(); + + // std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; } @@ -414,7 +608,7 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, * Wick_Contractions function above */ template template -void BaryonUtils::ContractBaryons_Sliced(const 
mobj &D1, +void BaryonUtils::ContractBaryonsSliced(const mobj &D1, const mobj &D2, const mobj &D3, const Gamma GammaA_left, @@ -429,16 +623,33 @@ void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); for (int t=0; t +template +void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const bool* wick_contractions, + const int nt, + robj &result) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + for (int t=0; t::ContractBaryons_Sliced(const mobj &D1, * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( +void BaryonUtils::BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, @@ -546,7 +757,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( +void BaryonUtils::BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, @@ -636,7 +847,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( * Dq4_tf is a quark line from t_f to t_J */ template template -void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( +void BaryonUtils::BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, @@ -728,7 +939,7 @@ void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( * https://aportelli.github.io/Hadrons-doc/#/mcontraction */ template template -void BaryonUtils::Baryon_Gamma_3pt( +void BaryonUtils::BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, @@ -751,7 +962,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites } else if (group == 2) { @@ -759,7 +970,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup2Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites } else if (group == 3) { @@ -767,7 +978,7 @@ void BaryonUtils::Baryon_Gamma_3pt( auto Dq_ti = vq_ti[ss]; auto Dq_tf = vq_tf[ss]; sobj result=Zero(); - Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + BaryonGamma3ptGroup3Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); vcorr[ss] += result; });//end loop over lattice sites @@ -787,7 +998,7 @@ void BaryonUtils::Baryon_Gamma_3pt( * Ds_ti is 
a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, +void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -838,7 +1049,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, +void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -897,7 +1108,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, +void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, const mobj2 &Du_spec, const mobj &Dd_tf, const mobj &Ds_ti, @@ -948,7 +1159,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, * Ds_ti is a quark line from t_i to t_H */ template template -void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, +void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, const mobj &Du_tf, const mobj2 &Du_spec, const mobj &Dd_tf, @@ -1002,7 +1213,7 @@ void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, template template -void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, +void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, const mobj &Du_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, @@ -1029,9 +1240,9 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, auto Ds_ti = vs_ti[ss]; sobj result=Zero(); if(op == "Q1"){ - Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2EyeSite(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } @@ -1041,7 +1252,7 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, template template -void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, +void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, const PropagatorField &qq_tf, const mobj &Du_spec, const PropagatorField &qd_tf, @@ -1071,9 +1282,9 @@ void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, auto Ds_ti = vs_ti[ss]; sobj result=Zero(); if(op == "Q1"){ - Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); } else { assert(0 && "Weak Operator not correctly specified"); } diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 7ac53246..675493b3 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -735,7 +735,6 @@ public: } } - template static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) { typedef typename GaugeField::vector_type vector_type; 
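The SU(N) utilities added in the hunk below implement a simple projection trick: ProjectOnGroup lands the link in U(N), then rescaling one row by the conjugate of the determinant makes the determinant exactly 1 without disturbing unitarity. A minimal single-site sketch with Eigen (already used elsewhere in this patch); the QR step stands in for ProjectOnGroup and is an assumption of this sketch, not Grid code:

```cpp
// Sketch (not part of the patch) of the SU(N) projection idea used by
// ProjectSUn below: for unitary Q, |det Q| = 1, so multiplying one row by
// conj(det Q) gives det = |det Q|^2 = 1 while rows stay orthonormal.
#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main() {
  const int N = 3;
  Eigen::MatrixXcd U = Eigen::MatrixXcd::Random(N, N);

  // Stand-in for ProjectOnGroup: the QR decomposition yields a unitary factor.
  Eigen::HouseholderQR<Eigen::MatrixXcd> qr(U);
  Eigen::MatrixXcd Q = qr.householderQ();        // Q in U(N)

  std::complex<double> det = Q.determinant();
  Q.row(N - 1) *= std::conj(det);                // last row rescaled, as in ProjectSUn

  std::cout << "det after projection: " << Q.determinant() << "\n"; // ~(1,0)
  return 0;
}
```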
@@ -800,6 +799,87 @@ public: } }; +template<int N> +LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + auto lvol = grid->lSites(); + LatticeComplexD ret(grid); + + autoView(Umu_v,Umu,CpuRead); + autoView(ret_v,ret,CpuWrite); + thread_for(site,lvol,{ + Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(site, lcoor); + iScalar<iScalar<iMatrix<ComplexD, N> > > Us; + peekLocalSite(Us, Umu_v, lcoor); + for(int i=0;i<N;i++){ + for(int j=0;j<N;j++){ + EigenU(i,j) = Us()()(i,j); + }} + ComplexD det = EigenU.determinant(); + pokeLocalSite(det,ret_v,lcoor); + }); + return ret; +} +template<int N> +static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu) +{ + Umu = ProjectOnGroup(Umu); + auto det = Determinant(Umu); + + det = conjugate(det); + + for(int i=0;i<N;i++){ + auto element = PeekIndex<ColourIndex>(Umu,N-1,i); + element = element * det; + PokeIndex<ColourIndex>(Umu,element,N-1,i); + } +} +template<int N> +static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<vComplexD, N> >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu<Nd;mu++){ + auto Umu = PeekIndex<LorentzIndex>(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSUn(Umu); + PokeIndex<LorentzIndex>(U,Umu,mu); + } +} +// Explicit specialisation for SU(3). +static void +ProjectSU3 (Lattice<iScalar<iScalar<iMatrix<vComplexD, 3> > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + const int x=0; + const int y=1; + const int z=2; + // Reunitarise + Umu = ProjectOnGroup(Umu); + autoView(Umu_v,Umu,CpuWrite); + thread_for(ss,grid->oSites(),{ + auto cm = Umu_v[ss]; + cm()()(2,x) = adj(cm()()(0,y)*cm()()(1,z)-cm()()(0,z)*cm()()(1,y)); //x= yz-zy + cm()()(2,y) = adj(cm()()(0,z)*cm()()(1,x)-cm()()(0,x)*cm()()(1,z)); //y= zx-xz + cm()()(2,z) = adj(cm()()(0,x)*cm()()(1,y)-cm()()(0,y)*cm()()(1,x)); //z= xy-yx + Umu_v[ss]=cm; + }); +} +static void ProjectSU3(Lattice<iVector<iScalar<iMatrix<vComplexD, 3> >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu<Nd;mu++){ + auto Umu = PeekIndex<LorentzIndex>(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSU3(Umu); + PokeIndex<LorentzIndex>(U,Umu,mu); + } +} + typedef SU<2> SU2; typedef SU<3> SU3; typedef SU<4> SU4; diff --git a/Grid/serialisation/JSON_IO.cc b/Grid/serialisation/JSON_IO.cc index aca8bab3..f2282099 100644 --- a/Grid/serialisation/JSON_IO.cc +++ b/Grid/serialisation/JSON_IO.cc @@ -26,7 +26,7 @@ *************************************************************************************/ /* END LEGAL */ #include -#ifndef __NVCC__ +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) NAMESPACE_BEGIN(Grid); diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 8b17f75a..8e55ce2f 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -38,12 +38,20 @@ Author: Peter Boyle #ifdef GRID_HIP #include #endif +#ifdef GRID_SYCL +namespace Grid { + typedef struct { uint16_t x;} half; + typedef struct { half x; half y;} half2; + typedef struct { float x; float y;} float2; + typedef struct { double x; double y;} double2; +} +#endif + namespace Grid { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) -typedef struct { uint16_t x;} half; -#endif + + typedef struct Half2_t { half x; half y; } Half2; #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) @@ -156,7 +164,7 @@ accelerator_inline float half2float(half h) f = __half2float(h); #else Grid_half hh; - hh.x = hr.x; + hh.x = h.x; f= sfw_half_to_float(hh); #endif return f; diff --git a/Grid/simd/Grid_vector_unops.h b/Grid/simd/Grid_vector_unops.h index d225699b..b89bb785 100644 --- a/Grid/simd/Grid_vector_unops.h +++ b/Grid/simd/Grid_vector_unops.h @@ -125,14 +125,6 @@ accelerator_inline Grid_simd sqrt(const Grid_simd &r) { return SimdApply(SqrtRealFunctor(), r); } template -accelerator_inline Grid_simd rsqrt(const Grid_simd &r) { - return SimdApply(RSqrtRealFunctor(), r); -} -template -accelerator_inline Scalar rsqrt(const Scalar &r) { - return (RSqrtRealFunctor(), r); -} -template -accelerator_inline
Grid_simd cos(const Grid_simd &r) { return SimdApply(CosRealFunctor(), r); } diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1e198972..23fc8203 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -269,7 +269,7 @@ public: std::vector > > face_table ; Vector surface_list; - Vector _entries; // Resident in managed memory + stencilVector _entries; // Resident in managed memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index 1ef9fc23..bbaa4a00 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -95,14 +95,18 @@ accelerator_inline iMatrix ProjectOnGroup(const iMatrix &arg) vtype nrm; vtype inner; for(int c1=0;c1memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) { #if 0 @@ -328,10 +333,12 @@ inline void *acceleratorAllocDevice(size_t bytes) return ptr; }; -inline void acceleratorFreeShared(void *ptr){ free(ptr);}; +inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} #endif @@ -354,7 +361,7 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc ////////////////////////////////////////////// // CPU Target - No accelerator just thread instead ////////////////////////////////////////////// -#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned + #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) #undef GRID_SIMT @@ -369,8 +376,10 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline int acceleratorIsCommunicable(void *ptr){ return 1; } +inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} #ifdef HAVE_MM_MALLOC_H inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; @@ -393,6 +402,8 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN, inline void acceleratorFreeCpu (void *ptr){free(ptr);}; #endif + + /////////////////////////////////////////////////// // Synchronise across local 
threads for divergence resynch /////////////////////////////////////////////////// diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index 37d16176..9be39e94 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -473,11 +473,13 @@ void Grid_init(int *argc,char ***argv) LebesgueOrder::UseLebesgueOrder=1; } CartesianCommunicator::nCommThreads = 1; +#ifdef GRID_COMMS_THREADS if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); assert(CartesianCommunicator::nCommThreads > 0); } +#endif if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index 0d80d425..87e7224d 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -1,4 +1,3 @@ - #include "Benchmark_IO.hpp" #ifndef BENCH_IO_LMIN @@ -13,6 +12,7 @@ #define BENCH_IO_NPASS 10 #endif +#ifdef HAVE_LIME using namespace Grid; std::string filestem(const int l) @@ -196,3 +196,6 @@ int main (int argc, char ** argv) return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp index c4a6ca58..2ff42d52 100644 --- a/benchmarks/Benchmark_IO.hpp +++ b/benchmarks/Benchmark_IO.hpp @@ -2,12 +2,12 @@ #define Benchmark_IO_hpp_ #include -#ifdef HAVE_LIME #define MSG std::cout << GridLogMessage #define SEP \ "-----------------------------------------------------------------------------" #define BIGSEP \ "=============================================================================" +#ifdef HAVE_LIME namespace Grid { diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc index e030bc39..8252547b 100644 --- a/benchmarks/Benchmark_IO_vs_dir.cc +++ b/benchmarks/Benchmark_IO_vs_dir.cc @@ -1,5 +1,5 @@ #include "Benchmark_IO.hpp" - +#ifdef HAVE_LIME using namespace Grid; int main (int argc, char ** argv) @@ -97,3 +97,6 @@ int main (int argc, char ** argv) return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index 54fe1ab0..032535b3 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -334,8 +334,9 @@ public: int threads = GridThread::GetThreads(); Coordinate mpi = GridDefaultMpi(); assert(mpi.size()==4); Coordinate local({L,L,L,L}); + Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); - GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(Coordinate({72,72,72,72}), + GridCartesian * TmpGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi()); uint64_t NP = TmpGrid->RankCount(); @@ -343,7 +344,6 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout<RankCount(); @@ -507,7 +512,6 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout< > xbuf(8); - std::vector > rbuf(8); + std::vector > xbuf(8); + std::vector > rbuf(8); for(int mu=0;mu<8;mu++){ xbuf[mu].resize(lat*lat*lat*Ls); diff --git a/benchmarks/Benchmark_comms_host_device.cc b/benchmarks/Benchmark_comms_host_device.cc new 
file mode 100644 index 00000000..591b5597 --- /dev/null +++ b/benchmarks/Benchmark_comms_host_device.cc @@ -0,0 +1,260 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_comms.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void header(){ + std::cout <1) nmu++; + + std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< > xbuf(8); + std::vector > rbuf(8); + + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + } + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout< xbuf(8); + std::vector rbuf(8); + + uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + } + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void 
*)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout< Make.inc echo >> Make.inc echo CCFILES=$CCFILES >> Make.inc - +echo ZWILS_FERMION_FILES=$ZWILS_FERMION_FILES >> Make.inc +echo WILS_FERMION_FILES=$WILS_FERMION_FILES >> Make.inc +echo STAG_FERMION_FILES=$STAG_FERMION_FILES >> Make.inc +echo GP_FERMION_FILES=$GP_FERMION_FILES >> Make.inc +echo ADJ_FERMION_FILES=$ADJ_FERMION_FILES >> Make.inc +echo TWOIND_FERMION_FILES=$TWOIND_FERMION_FILES >> Make.inc # tests Make.inc cd $home/tests @@ -26,11 +40,10 @@ for subdir in $dirs; do echo "tests-local: ${TESTLIST} " > Make.inc echo ${PREF}_PROGRAMS = ${TESTLIST} >> Make.inc echo >> Make.inc - HADLINK=`[ $subdir = './hadrons' ] && echo '-lHadrons '` for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=${HADLINK}-lGrid >> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done if [ $subdir != '.' ]; then @@ -49,7 +62,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done cd .. @@ -65,7 +78,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc echo >> Make.inc done cd .. diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc new file mode 100644 index 00000000..9a6781f1 --- /dev/null +++ b/tests/core/Test_reunitarise.cc @@ -0,0 +1,144 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD UU(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU::HotConfiguration(pRNG,Umu); + + U = PeekIndex(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "<(U,Nc-1,i); + element = element * phase; + PokeIndex(U,element,Nc-1,i); + } + UU=U; + + detU= Determinant(U) ; + detU=detU-1.0; + std::cout << "Determinant defect before projection " < +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU::HotConfiguration(pRNG,Umu); + + U = PeekIndex(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "<::ColdConfiguration(Umu); - // SU::HotConfiguration(RNG4,Umu); + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); + } RealD mass=0.3; RealD M5 =1.0; diff --git a/tests/solver/Test_coarse_even_odd.cc b/tests/solver/Test_coarse_even_odd.cc new file mode 100644 index 00000000..dfbab747 --- /dev/null +++ b/tests/solver/Test_coarse_even_odd.cc @@ -0,0 +1,475 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/solver/Test_coarse_even_odd.cc + + Copyright (C) 2015-2020 + + Author: Daniel Richtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace Grid; + +#ifndef NBASIS +#define NBASIS 40 +#endif + +// NOTE: The tests in this file are written in analogy to +// - tests/core/Test_wilson_even_odd.cc +// - tests/core/Test_wilson_clover.cc + +std::vector readFromCommandlineIvec(int* argc, + char*** argv, + std::string&& option, + const std::vector& defaultValue) { + std::string arg; + std::vector ret(defaultValue); + if(GridCmdOptionExists(*argv, *argv + *argc, option)) { + arg = GridCmdOptionPayload(*argv, *argv + *argc, option); + GridCmdOptionIntVector(arg, ret); + } + return ret; +} + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + ///////////////////////////////////////////////////////////////////////////// + // Read from command line // + ///////////////////////////////////////////////////////////////////////////// + + const int nbasis = NBASIS; static_assert((nbasis & 0x1) == 0, ""); + const int nb = nbasis/2; + Coordinate blockSize = readFromCommandlineIvec(&argc, &argv, "--blocksize", {2, 2, 2, 2}); + + std::cout << GridLogMessage << "Compiled with nbasis = " << nbasis << " -> nb = " << nb << std::endl; + + ///////////////////////////////////////////////////////////////////////////// + // General setup // + ///////////////////////////////////////////////////////////////////////////// + + Coordinate clatt = GridDefaultLatt(); + for(int d=0; dshow_decomposition(); + std::cout << GridLogMessage << "Grid_c:" << std::endl; Grid_c->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_f:" << std::endl; RBGrid_f->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_c:" << std::endl; RBGrid_c->show_decomposition(); + + GridParallelRNG pRNG_f(Grid_f); + GridParallelRNG pRNG_c(Grid_c); + + std::vector seeds({1, 2, 3, 4}); + + pRNG_f.SeedFixedIntegers(seeds); + pRNG_c.SeedFixedIntegers(seeds); + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Dirac Matrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + LatticeGaugeField Umu(Grid_f); SU3::HotConfiguration(pRNG_f, Umu); + + RealD checkTolerance = (getPrecision::value == 1) ? 
1e-7 : 1e-15; + + RealD mass = -0.30; + RealD csw = 1.9192; + + WilsonCloverFermionR Dwc(Umu, *Grid_f, *RBGrid_f, mass, csw, csw); + MdagMLinearOperator MdagMOp_Dwc(Dwc); + + ///////////////////////////////////////////////////////////////////////////// + // Type definitions // + ///////////////////////////////////////////////////////////////////////////// + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseDiracMatrix; + typedef CoarseDiracMatrix::CoarseVector CoarseVector; + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Aggregation // + ///////////////////////////////////////////////////////////////////////////// + + Aggregates Aggs(Grid_c, Grid_f, 0); + { + LatticeFermion tmp(Aggs.subspace[0].Grid()); + for(int n = 0; n < nb; n++) { + gaussian(pRNG_f, Aggs.subspace[n]); + G5C(tmp, Aggs.subspace[n]); + axpby(Aggs.subspace[n + nb], 0.5, -0.5, Aggs.subspace[n], tmp); + axpby(Aggs.subspace[n], 0.5, 0.5, Aggs.subspace[n], tmp); + } + } + + ///////////////////////////////////////////////////////////////////////////// + // Setup of CoarsenedMatrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + const int hermitian = 0; + CoarseDiracMatrix Dc(*Grid_c, *RBGrid_c, hermitian); + Dc.CoarsenOperator(Grid_f, MdagMOp_Dwc, Aggs); + MdagMLinearOperator MdagMOp_Dc(Dc); + + ///////////////////////////////////////////////////////////////////////////// + // Setup vectors used in all tests // + ///////////////////////////////////////////////////////////////////////////// + + CoarseVector src(Grid_c); random(pRNG_c, src); + CoarseVector diff(Grid_c); diff = Zero(); + + ///////////////////////////////////////////////////////////////////////////// + // Start of tests // + ///////////////////////////////////////////////////////////////////////////// + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Dhop + Mdiag = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + Dc.Mdiag(src, phi); std::cout << GridLogMessage << "Applied Mdiag" << std::endl; + Dc.Dhop(src, chi, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + Dc.M(src, ref); std::cout << GridLogMessage << "Applied M" << std::endl; + + res = phi + chi; + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Munprec), norm2(Dhop + Mdiag), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev << " -> check " + << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe = Dhop" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector res_e(RBGrid_c); res_e = Zero(); + CoarseVector res_o(RBGrid_c); res_o = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + Dc.Meooe(src_e, res_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(src_o, res_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Dhop(src, ref, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + + setCheckerboard(res, res_o); + setCheckerboard(res, res_e); + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dhop), norm2(Meo + Moe), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag M^dag M v)| = 0" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.M(src, tmp); std::cout << GridLogMessage << "Applied M" << std::endl; + Dc.Mdag(tmp, phi); std::cout << GridLogMessage << "Applied Mdag" << std::endl; + + std::cout << GridLogMessage << "src = " << norm2(src) << " tmp = " << norm2(tmp) << " phi = " << norm2(phi) << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag M^dag M v), Im(v^dag M^dag M v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag Mooee^dag Mooee v)| = 0 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeDag(tmp, phi); std::cout << GridLogMessage << "Applied MooeeDag" << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag Mooee^dag Mooee v), Im(v^dag Mooee^dag Mooee v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeInv(tmp, phi); std::cout << GridLogMessage << "Applied MooeeInv" << std::endl; + + diff = src - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(src); + std::cout << GridLogMessage << "norm2(src), norm2(MooeeInv Mooee src), abs. deviation, rel. deviation: " + << norm2(src) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MeooeDagger is the dagger of Meooe by requiring" << std::endl; + std::cout << GridLogMessage << "= < phi | Meooe | chi > * = < chi | Meooe^dag| phi>" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + // clang-format off + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + // clang-format on + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + Dc.Meooe(chi_e, dchi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(chi_o, dchi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.MeooeDag(phi_e, dphi_o); std::cout << GridLogMessage << "Applied MeoDag" << std::endl; + Dc.MeooeDag(phi_o, dphi_e); std::cout << GridLogMessage << "Applied MoeDag" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogDebug << "norm dchi_e = " << norm2(dchi_e) << " norm dchi_o = " << norm2(dchi_o) << " norm dphi_e = " << norm2(dphi_e) + << " norm dphi_o = " << norm2(dphi_e) << std::endl; + + std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_o) " << phiDchi_e - conj(chiDphi_o) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_e) " << phiDchi_o - conj(chiDphi_e) << std::endl; + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test 
MooeeInv Mooee = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.Mooee(chi_e, tmp_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.MooeeInv(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInv" << std::endl; + Dc.Mooee(chi_o, tmp_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.MooeeInv(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInv" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeInv Mee chi), abs. deviation, rel. deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeDag MooeeInvDag = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.MooeeDag(chi_e, tmp_e); std::cout << GridLogMessage << "Applied MeeDag" << std::endl; + Dc.MooeeInvDag(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInvDag" << std::endl; + Dc.MooeeDag(chi_o, tmp_o); std::cout << GridLogMessage << "Applied MooDag" << std::endl; + Dc.MooeeInvDag(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInvDag" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeDag MeeInvDag chi), abs. deviation, rel. deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe + Moo + Mee = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + + // M phi = (Mooee src_e + Meooe src_o , Mooee src_o + Meooe src_e) + + Dc.M(src, ref); // Reference result from the unpreconditioned operator + + // EO matrix + Dc.Mooee(src_e, chi_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.Mooee(src_o, chi_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.Meooe(src_o, phi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Meooe(src_e, phi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + + phi_o += chi_o; + phi_e += chi_e; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + std::cout << GridLogDebug << "norm phi_e = " << norm2(phi_e) << " norm phi_o = " << norm2(phi_o) << " norm phi = " << norm2(phi) << std::endl; + + diff = ref - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dunprec), norm2(Deoprec), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MpcDagMpc is hermitian" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + SchurDiagMooeeOperator HermOpEO(Dc); + + HermOpEO.MpcDagMpc(chi_e, dchi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_e" << std::endl; + HermOpEO.MpcDagMpc(chi_o, dchi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_o" << std::endl; + HermOpEO.MpcDagMpc(phi_e, dphi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_e" << std::endl; + HermOpEO.MpcDagMpc(phi_o, dphi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_o" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_e) " << phiDchi_e - conj(chiDphi_e) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_o) " << phiDchi_o - conj(chiDphi_o) << std::endl; + } + + Grid_finalize(); +}