diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 129fd582..00000000 --- a/.travis.yml +++ /dev/null @@ -1,61 +0,0 @@ -language: cpp - -cache: - directories: - - clang - -matrix: - include: - - os: osx - osx_image: xcode8.3 - compiler: clang - env: PREC=single - - os: osx - osx_image: xcode8.3 - compiler: clang - env: PREC=double - -before_install: - - export GRIDDIR=`pwd` - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi - -install: - - export CWD=`pwd` - - echo $CWD - - export CC=$CC$VERSION - - export CXX=$CXX$VERSION - - echo $PATH - - which autoconf - - autoconf --version - - which automake - - automake --version - - which $CC - - $CC --version - - which $CXX - - $CXX --version - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi - -script: - - ./bootstrap.sh - - mkdir build - - cd build - - mkdir lime - - cd lime - - mkdir build - - cd build - - wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz - - tar xf lime-1.3.2.tar.gz - - cd lime-1.3.2 - - ./configure --prefix=$CWD/build/lime/install - - make -j4 - - make install - - cd $CWD/build - - ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF} - - make -j4 - - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals - - make check diff 
--git a/Grid/DisableWarnings.h b/Grid/DisableWarnings.h index 8ea219fb..4bd1edd0 100644 --- a/Grid/DisableWarnings.h +++ b/Grid/DisableWarnings.h @@ -37,7 +37,9 @@ directory #endif //disables and intel compiler specific warning (in json.hpp) +#ifdef __ICC #pragma warning disable 488 +#endif #ifdef __NVCC__ //disables nvcc specific warning in json.hpp diff --git a/Grid/GridStd.h b/Grid/GridStd.h index ecb561ea..28f6bc46 100644 --- a/Grid/GridStd.h +++ b/Grid/GridStd.h @@ -28,4 +28,7 @@ /////////////////// #include "Config.h" +#ifdef TOFU +#undef GRID_COMMS_THREADS +#endif #endif /* GRID_STD_H */ diff --git a/Grid/Makefile.am b/Grid/Makefile.am index f1fa462e..7c3c151b 100644 --- a/Grid/Makefile.am +++ b/Grid/Makefile.am @@ -21,6 +21,7 @@ if BUILD_HDF5 extra_headers+=serialisation/Hdf5Type.h endif + all: version-cache Version.h version-cache: @@ -53,6 +54,19 @@ Version.h: version-cache include Make.inc include Eigen.inc +extra_sources+=$(WILS_FERMION_FILES) +extra_sources+=$(STAG_FERMION_FILES) +if BUILD_ZMOBIUS + extra_sources+=$(ZWILS_FERMION_FILES) +endif +if BUILD_GPARITY + extra_sources+=$(GP_FERMION_FILES) +endif +if BUILD_FERMION_REPS + extra_sources+=$(ADJ_FERMION_FILES) + extra_sources+=$(TWOIND_FERMION_FILES) +endif + lib_LIBRARIES = libGrid.a CCFILES += $(extra_sources) diff --git a/Grid/algorithms/CoarsenedMatrix.h b/Grid/algorithms/CoarsenedMatrix.h index 8d184aea..b9594678 100644 --- a/Grid/algorithms/CoarsenedMatrix.h +++ b/Grid/algorithms/CoarsenedMatrix.h @@ -31,6 +31,7 @@ Author: paboyle #ifndef GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H +#include // needed for Dagger(Yes|No), Inverse(Yes|No) NAMESPACE_BEGIN(Grid); @@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice &CoarseInner, class Geometry { public: int npoint; + int base; std::vector directions ; std::vector displacements; + std::vector points_dagger; Geometry(int _d) { - int base = (_d==5) ? 1:0; + base = (_d==5) ? 
1:0; // make coarse grid stencil for 4d , not 5d if ( _d==5 ) _d=4; @@ -72,16 +75,51 @@ public: npoint = 2*_d+1; directions.resize(npoint); displacements.resize(npoint); + points_dagger.resize(npoint); for(int d=0;d<_d;d++){ directions[d ] = d+base; directions[d+_d] = d+base; displacements[d ] = +1; displacements[d+_d]= -1; + points_dagger[d ] = d+_d; + points_dagger[d+_d] = d; } directions [2*_d]=0; displacements[2*_d]=0; + points_dagger[2*_d]=2*_d; } + int point(int dir, int disp) { + assert(disp == -1 || disp == 0 || disp == 1); + assert(base+0 <= dir && dir < base+4); + + // directions faster index = new indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 1 2 3 0 1 2 3 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 2 3 4 1 2 3 4 0 + // disp +1 +1 +1 +1 -1 -1 -1 -1 0 + + // displacements faster index = old indexing + // 4d (base = 0): + // point 0 1 2 3 4 5 6 7 8 + // dir 0 0 1 1 2 2 3 3 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + // 5d (base = 1): + // point 0 1 2 3 4 5 6 7 8 + // dir 1 1 2 2 3 3 4 4 0 + // disp +1 -1 +1 -1 +1 -1 +1 -1 0 + + if(dir == 0 and disp == 0) + return 8; + else // New indexing + return (1 - disp) / 2 * 4 + dir - base; + // else // Old indexing + // return (4 * (dir - base) + 1 - disp) / 2; + } }; template @@ -258,7 +296,7 @@ public: // Fine Object == (per site) type of fine field // nbasis == number of deflation vectors template -class CoarsenedMatrix : public SparseMatrixBase > > { +class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase > > { public: typedef iVector siteVector; @@ -268,33 +306,59 @@ public: typedef iMatrix Cobj; typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice FineField; + typedef CoarseVector FermionField; + + // enrich interface, use default implementation as in FermionOperator /////// + void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; } + void DminusDag(CoarseVector const& in, 
CoarseVector& out) { out = in; } + void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; } + void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; + void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; }; //////////////////// // Data members //////////////////// Geometry geom; GridBase * _grid; + GridBase* _cbgrid; int hermitian; CartesianStencil Stencil; + CartesianStencil StencilEven; + CartesianStencil StencilOdd; std::vector A; - + std::vector Aeven; + std::vector Aodd; + + CoarseMatrix AselfInv; + CoarseMatrix AselfInvEven; + CoarseMatrix AselfInvOdd; + + Vector dag_factor; + /////////////////////// // Interface /////////////////////// GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know + GridBase * RedBlackGrid() { return _cbgrid; }; + + int ConstEE() { return 0; } void M (const CoarseVector &in, CoarseVector &out) { conformable(_grid,in.Grid()); conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); SimpleCompressor compressor; Stencil.HaloExchange(in,compressor); autoView( in_v , in, AcceleratorRead); autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -316,14 +380,14 @@ public: int ptype; StencilEntry *SE; - for(int point=0;point_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -344,12 +408,72 @@ public: return M(in,out); } else { // corresponds to Galerkin coarsening - CoarseVector tmp(Grid()); - G5C(tmp, in); - M(tmp, out); - 
G5C(out, out); + return MdagNonHermitian(in, out); } }; + + void MdagNonHermitian(const CoarseVector &in, CoarseVector &out) + { + conformable(_grid,in.Grid()); + conformable(in.Grid(),out.Grid()); + out.Checkerboard() = in.Checkerboard(); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + autoView( in_v , in, AcceleratorRead); + autoView( out_v , out, AcceleratorWrite); + autoView( Stencil_v , Stencil, AcceleratorRead); + auto& geom_v = geom; + typedef LatticeView Aview; + + Vector AcceleratorViewContainer; + + for(int p=0;poSites(); + + Vector points(geom.npoint, 0); + for(int p=0; poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; @@ -359,6 +483,7 @@ public: { conformable(_grid,in.Grid()); conformable(_grid,out.Grid()); + out.Checkerboard() = in.Checkerboard(); typedef LatticeView Aview; Vector AcceleratorViewContainer; @@ -367,6 +492,7 @@ public: autoView( out_v , out, AcceleratorWrite); autoView( in_v , in, AcceleratorRead); + autoView( Stencil_v , Stencil, AcceleratorRead); const int Nsimd = CComplex::Nsimd(); typedef decltype(coalescedRead(in_v[0])) calcVector; @@ -380,12 +506,12 @@ public: int ptype; StencilEntry *SE; - SE=Stencil.GetEntry(ptype,point,ss); + SE=Stencil_v.GetEntry(ptype,point,ss); if(SE->_is_local) { nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); } else { - nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]); + nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); } acceleratorSynchronise(); @@ -413,34 +539,7 @@ public: this->MdirComms(in); - int ndim = in.Grid()->Nd(); - - ////////////// - // 4D action like wilson - // 0+ => 0 - // 0- => 1 - // 1+ => 2 - // 1- => 3 - // 
etc.. - ////////////// - // 5D action like DWF - // 1+ => 0 - // 1- => 1 - // 2+ => 2 - // 2- => 3 - // etc.. - auto point = [dir, disp, ndim](){ - if(dir == 0 and disp == 0) - return 8; - else if ( ndim==4 ) { - return (4 * dir + 1 - disp) / 2; - } else { - return (4 * (dir-1) + 1 - disp) / 2; - } - }(); - - MdirCalc(in,out,point); - + MdirCalc(in,out,geom.point(dir,disp)); }; void Mdiag(const CoarseVector &in, CoarseVector &out) @@ -449,23 +548,296 @@ public: MdirCalc(in, out, point); // No comms }; - - CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) : + void Mooee(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseNo); + } + void MooeeInv(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerNo, InverseYes); + } + + void MooeeDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseNo); + } + + void MooeeInvDag(const CoarseVector &in, CoarseVector &out) { + MooeeInternal(in, out, DaggerYes, InverseYes); + } + + void Meooe(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerNo); + } else { + DhopOE(in, out, DaggerNo); + } + } + + void MeooeDag(const CoarseVector &in, CoarseVector &out) { + if(in.Checkerboard() == Odd) { + DhopEO(in, out, DaggerYes); + } else { + DhopOE(in, out, DaggerYes); + } + } + + void Dhop(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _grid); // verifies full grid + conformable(in.Grid(), out.Grid()); + + out.Checkerboard() = in.Checkerboard(); + + DhopInternal(Stencil, A, in, out, dag); + } + + void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Even); + out.Checkerboard() = Odd; + + DhopInternal(StencilEven, Aodd, in, out, dag); + } + + void DhopEO(const CoarseVector &in, CoarseVector &out, int 
dag) { + conformable(in.Grid(), _cbgrid); // verifies half grid + conformable(in.Grid(), out.Grid()); // drops the cb check + + assert(in.Checkerboard() == Odd); + out.Checkerboard() = Even; + + DhopInternal(StencilOdd, Aeven, in, out, dag); + } + + void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) { + out.Checkerboard() = in.Checkerboard(); + assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); + + CoarseMatrix *Aself = nullptr; + if(in.Grid()->_isCheckerBoarded) { + if(in.Checkerboard() == Odd) { + Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1]; + DselfInternal(StencilOdd, *Aself, in, out, dag); + } else { + Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1]; + DselfInternal(StencilEven, *Aself, in, out, dag); + } + } else { + Aself = (inv) ? &AselfInv : &A[geom.npoint-1]; + DselfInternal(Stencil, *Aself, in, out, dag); + } + assert(Aself != nullptr); + } + + void DselfInternal(CartesianStencil &st, CoarseMatrix &a, + const CoarseVector &in, CoarseVector &out, int dag) { + int point = geom.npoint-1; + autoView( out_v, out, AcceleratorWrite); + autoView( in_v, in, AcceleratorRead); + autoView( st_v, st, AcceleratorRead); + autoView( a_v, a, AcceleratorRead); + + const int Nsimd = CComplex::Nsimd(); + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + + RealD* dag_factor_p = &dag_factor[0]; + + if(dag) { + accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + 
StencilEntry *SE; + + SE=st_v.GetEntry(ptype,point,ss); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb &st, std::vector &a, + const CoarseVector &in, CoarseVector &out, int dag) { + SimpleCompressor compressor; + + st.HaloExchange(in,compressor); + autoView( in_v, in, AcceleratorRead); + autoView( out_v, out, AcceleratorWrite); + autoView( st_v , st, AcceleratorRead); + typedef LatticeView Aview; + + // determine in what order we need the points + int npoint = geom.npoint-1; + Vector points(npoint, 0); + for(int p=0; p AcceleratorViewContainer; + for(int p=0;poSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bboSites()*nbasis, Nsimd, { + int ss = sss/nbasis; + int b = sss%nbasis; + calcComplex res = Zero(); + calcVector nbr; + int ptype; + StencilEntry *SE; + + for(int p=0;p_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(st_v.CommBuf()[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb > &linop, Aggregation & Subspace) { typedef Lattice FineComplexField; typedef typename Fobj::scalar_type scalar_type; + std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl; + FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); @@ -496,11 +868,13 @@ public: CoarseScalar InnerProd(Grid()); + std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl; // Orthogonalise the subblocks over the basis blockOrthogonalise(InnerProd,Subspace.subspace); // Compute the matrix 
elements of linop between this orthonormal // set of vectors. + std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl; int self_stencil=-1; for(int p=0;poSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); + if ( hermitian && (disp==-1) ) { + for(int pp=0;pp = * + int dirp = geom.directions[pp]; + int dispp = geom.displacements[pp]; + if ( (dirp==dir) && (dispp==1) ){ + auto sft = conjugate(Cshift(oZProj,dir,1)); + autoView( sft_v , sft , AcceleratorWrite); + autoView( A_pp , A[pp], AcceleratorWrite); + accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); }); + } + } + } } } @@ -606,28 +992,54 @@ public: } if(hermitian) { std::cout << GridLogMessage << " ForceHermitian, new code "<lSites(); + + typedef typename Cobj::scalar_object scalar_object; + + autoView(Aself_v, A[geom.npoint-1], CpuRead); + autoView(AselfInv_v, AselfInv, CpuWrite); + thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke + Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis); + + scalar_object selfLink = Zero(); + scalar_object selfLinkInv = Zero(); + + Coordinate lcoor; + + Grid()->LocalIndexToLocalCoor(site, lcoor); + peekLocalSite(selfLink, Aself_v, lcoor); + + for (int i = 0; i < nbasis; ++i) + for (int j = 0; j < nbasis; ++j) + selfLinkEigen(i, j) = static_cast(TensorRemove(selfLink(i, j))); + + selfLinkInvEigen = selfLinkEigen.inverse(); + + for(int i = 0; i < nbasis; ++i) + for(int j = 0; j < nbasis; ++j) + selfLinkInv(i, j) = selfLinkInvEigen(i, j); + + pokeLocalSite(selfLinkInv, AselfInv_v, lcoor); + }); + } + + void FillHalfCbs() { + std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl; + for(int p = 0; p < geom.npoint; ++p) { + pickCheckerboard(Even, Aeven[p], A[p]); + pickCheckerboard(Odd, Aodd[p], A[p]); } + pickCheckerboard(Even, AselfInvEven, 
AselfInv); + pickCheckerboard(Odd, AselfInvOdd, AselfInv); } }; diff --git a/Grid/allocator/AlignedAllocator.cc b/Grid/allocator/AlignedAllocator.cc deleted file mode 100644 index 0d1707d9..00000000 --- a/Grid/allocator/AlignedAllocator.cc +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include - -NAMESPACE_BEGIN(Grid); - -MemoryStats *MemoryProfiler::stats = nullptr; -bool MemoryProfiler::debug = false; - -void check_huge_pages(void *Buf,uint64_t BYTES) -{ -#ifdef __linux__ - int fd = open("/proc/self/pagemap", O_RDONLY); - assert(fd >= 0); - const int page_size = 4096; - uint64_t virt_pfn = (uint64_t)Buf / page_size; - off_t offset = sizeof(uint64_t) * virt_pfn; - uint64_t npages = (BYTES + page_size-1) / page_size; - uint64_t pagedata[npages]; - uint64_t ret = lseek(fd, offset, SEEK_SET); - assert(ret == offset); - ret = ::read(fd, pagedata, sizeof(uint64_t)*npages); - assert(ret == sizeof(uint64_t) * npages); - int nhugepages = npages / 512; - int n4ktotal, nnothuge; - n4ktotal = 0; - nnothuge = 0; - for (int i = 0; i < nhugepages; ++i) { - uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; - for (int j = 0; j < 512; ++j) { - uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size; - ++n4ktotal; - if (pageaddr != baseaddr + j * page_size) - ++nnothuge; - } - } - int rank = CartesianCommunicator::RankWorld(); - printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge); -#endif -} - -std::string sizeString(const size_t bytes) -{ - constexpr unsigned int bufSize = 256; - const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"}; - char buf[256]; - size_t s = 0; - double count = bytes; - - while (count >= 1024 && s < 7) - { - s++; - count /= 1024; - } - if (count - floor(count) == 0.0) - { - snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]); - } - else - { - snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]); - } - - return std::string(buf); -} - -NAMESPACE_END(Grid); - diff --git 
a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h index 249732fb..91622789 100644 --- a/Grid/allocator/AlignedAllocator.h +++ b/Grid/allocator/AlignedAllocator.h @@ -165,9 +165,18 @@ template inline bool operator!=(const devAllocator<_Tp>&, const d //////////////////////////////////////////////////////////////////////////////// // Template typedefs //////////////////////////////////////////////////////////////////////////////// -//template using commAllocator = devAllocator; -template using Vector = std::vector >; +#ifdef ACCELERATOR_CSHIFT +// Cshift on device +template using cshiftAllocator = devAllocator; +#else +// Cshift on host +template using cshiftAllocator = std::allocator; +#endif + +template using Vector = std::vector >; +template using stencilVector = std::vector >; template using commVector = std::vector >; +template using cshiftVector = std::vector >; NAMESPACE_END(Grid); diff --git a/Grid/allocator/MemoryManager.h b/Grid/allocator/MemoryManager.h index aac13aee..25c5b5f5 100644 --- a/Grid/allocator/MemoryManager.h +++ b/Grid/allocator/MemoryManager.h @@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid); // Move control to configure.ac and Config.h? -#define ALLOCATION_CACHE -#define GRID_ALLOC_ALIGN (2*1024*1024) #define GRID_ALLOC_SMALL_LIMIT (4096) /*Pinning pages is costly*/ diff --git a/Grid/allocator/MemoryManagerCache.cc b/Grid/allocator/MemoryManagerCache.cc index 5dd7575e..275ed5e0 100644 --- a/Grid/allocator/MemoryManagerCache.cc +++ b/Grid/allocator/MemoryManagerCache.cc @@ -1,11 +1,12 @@ #include - #ifndef GRID_UVM #warning "Using explicit device memory copies" NAMESPACE_BEGIN(Grid); +//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout); #define dprintf(...) 
+ //////////////////////////////////////////////////////////// // For caching copies of data on device //////////////////////////////////////////////////////////// @@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); assert(AccCache.CpuPtr!=(uint64_t)NULL); @@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); + dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); } uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) /////////////////////////////////////////////////////////////////////////// assert(AccCache.state!=Empty); - // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); + dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); assert(AccCache.accLock==0); assert(AccCache.cpuLock==0); if(AccCache.state==AccDirty) { @@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache) AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); DeviceBytes -=AccCache.bytes; LRUremove(AccCache); - // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); + dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); } 
uint64_t CpuPtr = AccCache.CpuPtr; EntryErase(CpuPtr); @@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache) assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); - // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); DeviceToHostBytes+=AccCache.bytes; DeviceToHostXfer++; AccCache.state=Consistent; @@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache) AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); DeviceBytes+=AccCache.bytes; } - // dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); + dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); HostToDeviceBytes+=AccCache.bytes; HostToDeviceXfer++; @@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,hint); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + if (!AccCache.AccPtr) { + EvictVictims(bytes); + } assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert(AccCache.cpuLock==0); // Programming error if(AccCache.state!=Empty) { + dprintf("ViewOpen found entry %llx %llx : %lld %lld\n", + (uint64_t)AccCache.CpuPtr, + (uint64_t)CpuPtr, + (uint64_t)AccCache.bytes, + (uint64_t)bytes); assert(AccCache.CpuPtr 
== CpuPtr); assert(AccCache.bytes ==bytes); } @@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod AccCache.state = Consistent; // CpuDirty + AccRead => Consistent } AccCache.accLock++; - // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==Consistent) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty else AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.accLock++; - // printf("Consistent entry into device accLock %d\n",AccCache.accLock); + dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); } else if(AccCache.state==AccDirty) { if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty else AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.accLock++; - // printf("AccDirty entry into device accLock %d\n",AccCache.accLock); + dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); } else { assert(0); } @@ -361,13 +368,16 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V // Find if present, otherwise get or force an empty //////////////////////////////////////////////////////////////////////////// if ( EntryPresent(CpuPtr)==0 ){ - EvictVictims(bytes); EntryCreate(CpuPtr,bytes,mode,transient); } auto AccCacheIterator = EntryLookup(CpuPtr); auto & AccCache = AccCacheIterator->second; - + + if (!AccCache.AccPtr) { + EvictVictims(bytes); + } + assert((mode==CpuRead)||(mode==CpuWrite)); assert(AccCache.accLock==0); // Programming error diff --git a/Grid/allocator/MemoryManagerShared.cc b/Grid/allocator/MemoryManagerShared.cc index 537f7c32..3f165007 100644 --- a/Grid/allocator/MemoryManagerShared.cc +++ 
b/Grid/allocator/MemoryManagerShared.cc @@ -1,7 +1,6 @@ #include #ifdef GRID_UVM -#warning "Grid is assuming unified virtual memory address space" NAMESPACE_BEGIN(Grid); ///////////////////////////////////////////////////////////////////////////////// // View management is 1:1 address space mapping diff --git a/Grid/cartesian/Cartesian_red_black.h b/Grid/cartesian/Cartesian_red_black.h index b71981f5..092d4910 100644 --- a/Grid/cartesian/Cartesian_red_black.h +++ b/Grid/cartesian/Cartesian_red_black.h @@ -36,7 +36,7 @@ static const int CbBlack=1; static const int Even =CbRed; static const int Odd =CbBlack; -accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk) +accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk) { int nd=rdim.size(); Coordinate coor(nd); diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h index bb06d43f..a15f9789 100644 --- a/Grid/communicator/Communicator_base.h +++ b/Grid/communicator/Communicator_base.h @@ -1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -108,6 +107,8 @@ public: //////////////////////////////////////////////////////////// // Reduction //////////////////////////////////////////////////////////// + void GlobalMax(RealD &); + void GlobalMax(RealF &); void GlobalSum(RealF &); void GlobalSumVector(RealF *,int N); void GlobalSum(RealD &); diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc index 83f71233..5713fe35 100644 --- a/Grid/communicator/Communicator_mpi3.cc +++ b/Grid/communicator/Communicator_mpi3.cc @@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) MPI_Initialized(&flag); // needed to coexist with other libs apparently if ( !flag ) { -#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori 
+#ifndef GRID_COMMS_THREADS nCommThreads=1; // wrong results here too // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs @@ -275,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); assert(ierr==0); } +void CartesianCommunicator::GlobalMax(float &f) +{ + int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator); + assert(ierr==0); +} +void CartesianCommunicator::GlobalMax(double &d) +{ + int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); + assert(ierr==0); +} void CartesianCommunicator::GlobalSum(float &f){ int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); assert(ierr==0); @@ -358,16 +368,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector +Author: Christoph Lehner This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -34,6 +35,9 @@ Author: Peter Boyle #endif #ifdef GRID_HIP #include +#endif +#ifdef GRID_SYCl + #endif NAMESPACE_BEGIN(Grid); @@ -69,6 +73,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) WorldNodes = WorldSize/WorldShmSize; assert( (WorldNodes * WorldShmSize) == WorldSize ); + // FIXME: Check all WorldShmSize are the same ? 
///////////////////////////////////////////////////////////////////// @@ -169,6 +174,23 @@ static inline int divides(int a,int b) } void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) { + //////////////////////////////////////////////////////////////// + // Allow user to configure through environment variable + //////////////////////////////////////////////////////////////// + char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str()); + if ( str ) { + std::vector IntShmDims; + GridCmdOptionIntVector(std::string(str),IntShmDims); + assert(IntShmDims.size() == WorldDims.size()); + long ShmSize = 1; + for (int dim=0;dim(theGridAccelerator->get_device()); + auto zeContext= cl::sycl::get_native(theGridAccelerator->get_context()); + ze_device_mem_alloc_desc_t zeDesc = {}; + zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf); + std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes + << "bytes at "<< std::hex<< ShmCommBuf < ranks(size); for(int r=0;r #include NAMESPACE_BEGIN(Grid); +#define header "SharedMemoryNone: " /*Construct from an MPI communicator*/ void GlobalSharedMemory::Init(Grid_MPI_Comm comm) @@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M //////////////////////////////////////////////////////////////////////////////////////////// // Hugetlbfs mapping intended, use anonymous mmap //////////////////////////////////////////////////////////////////////////////////////////// +#if 1 +void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) +{ + std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "< > Cshift_table; // Gather for when there is no need to SIMD split /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimension,int plane,int cbmask, int off=0) +Gather_plane_simple 
(const Lattice &rhs,cshiftVector &buffer,int dimension,int plane,int cbmask, int off=0) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice &rhs,commVector &buffer,int dimen } } { - autoView(rhs_v , rhs, AcceleratorRead); auto buffer_p = & buffer[0]; auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT + autoView(rhs_v , rhs, AcceleratorRead); accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for(i,ent,{ + buffer_p[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -103,21 +110,36 @@ Gather_plane_extract(const Lattice &rhs, int n1=rhs.Grid()->_slice_stride[dimension]; if ( cbmask ==0x3){ +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); - accelerator_for2d(n,e1,b,e2,1,{ + accelerator_for(nn,e1*e2,1,{ + int n = nn%e1; + int b = nn/e1; int o = n*n1; int offset = b+n*e2; vobj temp =rhs_v[so+o+b]; extract(temp,pointers,offset); }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + int o = n*n1; + int offset = b+n*e2; + + vobj temp =rhs_v[so+o+b]; + extract(temp,pointers,offset); + }); +#endif } else { - autoView(rhs_v , rhs, AcceleratorRead); - Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate cdm =rhs.Grid()->_checker_dim_mask; std::cout << " Dense packed buffer WARNING " < &rhs, extract(temp,pointers,offset); } }); +#else + autoView(rhs_v , rhs, CpuRead); + thread_for2d(n,e1,b,e2,{ + + Coordinate coor; + + int o=n*n1; + int oindex = o+b; + + int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm); + + int ocb=1<(temp,pointers,offset); + } + }); +#endif } } ////////////////////////////////////////////////////// // Scatter for when there is no need to SIMD split ////////////////////////////////////////////////////// -template void Scatter_plane_simple (Lattice &rhs,commVector &buffer, int dimension,int plane,int cbmask) +template void 
Scatter_plane_simple (Lattice &rhs,cshiftVector &buffer, int dimension,int plane,int cbmask) { int rd = rhs.Grid()->_rdimensions[dimension]; @@ -182,12 +224,19 @@ template void Scatter_plane_simple (Lattice &rhs,commVector void Scatter_plane_merge(Lattice &rhs,ExtractPointerA int e2=rhs.Grid()->_slice_block[dimension]; if(cbmask ==0x3 ) { - autoView( rhs_v , rhs, AcceleratorWrite); int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension]; - accelerator_for2d(n,e1,b,e2,1,{ +#ifdef ACCELERATOR_CSHIFT + autoView( rhs_v , rhs, AcceleratorWrite); + accelerator_for(nn,e1*e2,1,{ + int n = nn%e1; + int b = nn/e1; int o = n*_slice_stride; int offset = b+n*_slice_block; merge(rhs_v[so+o+b],pointers,offset); }); +#else + autoView( rhs_v , rhs, CpuWrite); + thread_for2d(n,e1,b,e2,{ + int o = n*_slice_stride; + int offset = b+n*_slice_block; + merge(rhs_v[so+o+b],pointers,offset); + }); +#endif } else { // Case of SIMD split AND checker dim cannot currently be hit, except in // Test_cshift_red_black code. 
- // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME + std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME std::cout<<" Unthreaded warning -- buffer is not densely packed ??"< void Copy_plane(Lattice& lhs,const Lattice &rhs } { + auto table = &Cshift_table[0]; +#ifdef ACCELERATOR_CSHIFT autoView(rhs_v , rhs, AcceleratorRead); autoView(lhs_v , lhs, AcceleratorWrite); - auto table = &Cshift_table[0]; accelerator_for(i,ent,vobj::Nsimd(),{ coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); }); +#else + autoView(rhs_v , rhs, CpuRead); + autoView(lhs_v , lhs, CpuWrite); + thread_for(i,ent,{ + lhs_v[table[i].first]=rhs_v[table[i].second]; + }); +#endif } } @@ -324,12 +392,20 @@ template void Copy_plane_permute(Lattice& lhs,const Lattice void Cshift_comms_simd(Lattice& ret,const Lattice void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) { typedef typename vobj::vector_type vector_type; @@ -121,9 +122,9 @@ template void Cshift_comms(Lattice &ret,const Lattice &r assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; - commVector send_buf(buffer_size); - commVector recv_buf(buffer_size); - + static cshiftVector send_buf; send_buf.resize(buffer_size); + static cshiftVector recv_buf; recv_buf.resize(buffer_size); + int cb= (cbmask==0x2)? 
Odd : Even; int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); @@ -138,7 +139,7 @@ template void Cshift_comms(Lattice &ret,const Lattice &r } else { - int words = send_buf.size(); + int words = buffer_size; if (cbmask != 0x3) words=words>>1; int bytes = words * sizeof(vobj); @@ -150,12 +151,14 @@ template void Cshift_comms(Lattice &ret,const Lattice &r int xmit_to_rank; grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, (void *)&recv_buf[0], recv_from_rank, bytes); + grid->Barrier(); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); @@ -195,8 +198,15 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice_slice_nblock[dimension]*grid->_slice_block[dimension]; // int words = sizeof(vobj)/sizeof(vector_type); - std::vector > send_buf_extract(Nsimd,commVector(buffer_size) ); - std::vector > recv_buf_extract(Nsimd,commVector(buffer_size) ); + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + + for(int s=0;s void Cshift_comms_simd(Lattice &ret,const LatticeShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); - grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], + grid->Barrier(); + + send_buf_extract_mpi = &send_buf_extract[nbr_lane][0]; + recv_buf_extract_mpi = &recv_buf_extract[i][0]; + grid->SendToRecvFrom((void *)send_buf_extract_mpi, xmit_to_rank, - (void *)&recv_buf_extract[i][0], + (void *)recv_buf_extract_mpi, recv_from_rank, bytes); + + grid->Barrier(); + + rpointers[i] = &recv_buf_extract[i][0]; + } else { + rpointers[i] = &send_buf_extract[nbr_lane][0]; + } + + } + Scatter_plane_merge(ret,rpointers,dimension,x,cbmask); + } + +} +#else +template void Cshift_comms(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) 
+{ + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + GridBase *grid=rhs.Grid(); + Lattice temp(rhs.Grid()); + + int fd = rhs.Grid()->_fdimensions[dimension]; + int rd = rhs.Grid()->_rdimensions[dimension]; + int pd = rhs.Grid()->_processors[dimension]; + int simd_layout = rhs.Grid()->_simd_layout[dimension]; + int comm_dim = rhs.Grid()->_processors[dimension] >1 ; + assert(simd_layout==1); + assert(comm_dim==1); + assert(shift>=0); + assert(shift_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; + static cshiftVector send_buf_v; send_buf_v.resize(buffer_size); + static cshiftVector recv_buf_v; recv_buf_v.resize(buffer_size); + vobj *send_buf; + vobj *recv_buf; + { + grid->ShmBufferFreeAll(); + size_t bytes = buffer_size*sizeof(vobj); + send_buf=(vobj *)grid->ShmBufferMalloc(bytes); + recv_buf=(vobj *)grid->ShmBufferMalloc(bytes); + } + + int cb= (cbmask==0x2)? Odd : Even; + int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + for(int x=0;x>1; + + int bytes = words * sizeof(vobj); + + Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask); + + // int rank = grid->_processor; + int recv_from_rank; + int xmit_to_rank; + grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); + + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes); + grid->SendToRecvFrom((void *)&send_buf[0], + xmit_to_rank, + (void *)&recv_buf[0], + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes); + + grid->Barrier(); + + Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask); + } + } +} + +template void Cshift_comms_simd(Lattice &ret,const Lattice &rhs,int dimension,int shift,int cbmask) +{ + GridBase *grid=rhs.Grid(); + const int Nsimd = grid->Nsimd(); + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_object scalar_object; + typedef 
typename vobj::scalar_type scalar_type; + + int fd = grid->_fdimensions[dimension]; + int rd = grid->_rdimensions[dimension]; + int ld = grid->_ldimensions[dimension]; + int pd = grid->_processors[dimension]; + int simd_layout = grid->_simd_layout[dimension]; + int comm_dim = grid->_processors[dimension] >1 ; + + //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<=0); + assert(shiftPermuteType(dimension); + + /////////////////////////////////////////////// + // Simd direction uses an extract/merge pair + /////////////////////////////////////////////// + int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; + // int words = sizeof(vobj)/sizeof(vector_type); + + static std::vector > send_buf_extract; send_buf_extract.resize(Nsimd); + static std::vector > recv_buf_extract; recv_buf_extract.resize(Nsimd); + scalar_object * recv_buf_extract_mpi; + scalar_object * send_buf_extract_mpi; + { + size_t bytes = sizeof(scalar_object)*buffer_size; + grid->ShmBufferFreeAll(); + send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes); + } + for(int s=0;s pointers(Nsimd); // + ExtractPointerArray rpointers(Nsimd); // received pointers + + /////////////////////////////////////////// + // Work out what to send where + /////////////////////////////////////////// + int cb = (cbmask==0x2)? Odd : Even; + int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); + + // loop over outer coord planes orthog to dim + for(int x=0;x>(permute_type+1)); + int ic= (i&inner_bit)? 
1:0; + + int my_coor = rd*ic + x; + int nbr_coor = my_coor+sshift; + int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors + + int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer + int nbr_ox = (nbr_coor%rd); // outer coord of peer + int nbr_lane = (i&(~inner_bit)); + + int recv_from_rank; + int xmit_to_rank; + + if (nbr_ic) nbr_lane|=inner_bit; + + assert (sx == nbr_ox); + + if(nbr_proc){ + grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); + + grid->Barrier(); + + acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes); + grid->SendToRecvFrom((void *)send_buf_extract_mpi, + xmit_to_rank, + (void *)recv_buf_extract_mpi, + recv_from_rank, + bytes); + acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes); + grid->Barrier(); rpointers[i] = &recv_buf_extract[i][0]; } else { @@ -258,7 +461,7 @@ template void Cshift_comms_simd(Lattice &ret,const Lattice #include #include #include -//#include +#include #include #include #include diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h index c43844f8..4a8a7423 100644 --- a/Grid/lattice/Lattice_ET.h +++ b/Grid/lattice/Lattice_ET.h @@ -342,19 +342,14 @@ inline void ExpressionViewClose(LatticeTrinaryExpression &expr) GridUnopClass(UnarySub, -a); GridUnopClass(UnaryNot, Not(a)); -GridUnopClass(UnaryAdj, adj(a)); -GridUnopClass(UnaryConj, conjugate(a)); GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); -GridUnopClass(UnaryToReal, toReal(a)); -GridUnopClass(UnaryToComplex, toComplex(a)); GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnarySqrt, sqrt(a)); -GridUnopClass(UnaryRsqrt, rsqrt(a)); GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryAsin, asin(a)); 
@@ -456,20 +451,17 @@ GridTrinOpClass(TrinaryWhere, GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot); -GRID_DEF_UNOP(adj, UnaryAdj); -GRID_DEF_UNOP(conjugate, UnaryConj); +//GRID_DEF_UNOP(adj, UnaryAdj); +//GRID_DEF_UNOP(conjugate, UnaryConj); GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); -GRID_DEF_UNOP(toReal, UnaryToReal); -GRID_DEF_UNOP(toComplex, UnaryToComplex); GRID_DEF_UNOP(timesI, UnaryTimesI); GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the // abs-fabs-dabs-labs thing GRID_DEF_UNOP(sqrt, UnarySqrt); -GRID_DEF_UNOP(rsqrt, UnaryRsqrt); GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(asin, UnaryAsin); @@ -494,27 +486,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere); ///////////////////////////////////////////////////////////// template auto closure(const LatticeUnaryExpression &expr) - -> Lattice + -> Lattice::type > { - Lattice ret(expr); + Lattice::type > ret(expr); return ret; } template auto closure(const LatticeBinaryExpression &expr) - -> Lattice + -> Lattice::type > { - Lattice ret(expr); + Lattice::type > ret(expr); return ret; } template auto closure(const LatticeTrinaryExpression &expr) - -> Lattice Lattice + vecEval(0, expr.arg3)))>::type > { - Lattice ret(expr); + vecEval(0, expr.arg3)))>::type > ret(expr); return ret; } #define EXPRESSION_CLOSURE(function) \ diff --git a/Grid/lattice/Lattice_basis.h b/Grid/lattice/Lattice_basis.h index af9d7280..863b2548 100644 --- a/Grid/lattice/Lattice_basis.h +++ b/Grid/lattice/Lattice_basis.h @@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm) basis_v.push_back(basis[k].View(AcceleratorWrite)); } -#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) +#if ( (!defined(GRID_CUDA)) ) int 
max_threads = thread_max(); Vector < vobj > Bt(Nm * max_threads); thread_region @@ -161,11 +161,13 @@ void basisRotateJ(Field &result,std::vector &basis,Eigen::MatrixXd& Qt,in double * Qt_j = & Qt_jv[0]; for(int k=0;koSites(),vobj::Nsimd(),{ - auto B=coalescedRead(zz); + vobj zzz=Zero(); + auto B=coalescedRead(zzz); for(int k=k0; k inline Lattice adj(const Lattice &lhs){ autoView( ret_v, ret, AcceleratorWrite); ret.Checkerboard()=lhs.Checkerboard(); - accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { - coalescedWrite(ret_v[ss], adj(lhs_v(ss))); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] = adj(lhs_v[ss]); }); return ret; }; @@ -64,6 +64,53 @@ template inline Lattice conjugate(const Lattice &lhs){ return ret; }; +template inline Lattice toComplex(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard() = lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] = toComplex(lhs_v[ss]); + }); + return ret; +}; +template inline Lattice toReal(const Lattice &lhs){ + Lattice ret(lhs.Grid()); + + autoView( lhs_v, lhs, AcceleratorRead); + autoView( ret_v, ret, AcceleratorWrite); + + ret.Checkerboard() = lhs.Checkerboard(); + accelerator_for( ss, lhs_v.size(), 1, { + ret_v[ss] = toReal(lhs_v[ss]); + }); + return ret; +}; + + +template::value,void>::type * = nullptr> +auto toComplex(const Expression &expr) -> decltype(closure(expr)) +{ + return toComplex(closure(expr)); +} +template::value,void>::type * = nullptr> +auto toReal(const Expression &expr) -> decltype(closure(expr)) +{ + return toReal(closure(expr)); +} +template::value,void>::type * = nullptr> +auto adj(const Expression &expr) -> decltype(closure(expr)) +{ + return adj(closure(expr)); +} +template::value,void>::type * = nullptr> +auto conjugate(const Expression &expr) -> decltype(closure(expr)) +{ + return conjugate(closure(expr)); +} + NAMESPACE_END(Grid); #endif diff --git 
a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index c2955485..0a5fbcb6 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -96,8 +96,34 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites) ssobj ret = ssum; return ret; } +/* +Threaded max, don't use for now +template +inline Double max(const Double *arg, Integer osites) +{ + // const int Nsimd = vobj::Nsimd(); + const int nthread = GridThread::GetThreads(); - + std::vector maxarray(nthread); + + thread_for(thr,nthread, { + int nwork, mywork, myoff; + nwork = osites; + GridThread::GetWork(nwork,thr,mywork,myoff); + Double max=arg[0]; + for(int ss=myoff;ss max ) max = arg[ss]; + } + maxarray[thr]=max; + }); + + Double tmax=maxarray[0]; + for(int i=0;itmax) tmax = maxarray[i]; + } + return tmax; +} +*/ template inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) { @@ -141,6 +167,32 @@ template inline RealD norm2(const Lattice &arg){ return real(nrm); } +//The global maximum of the site norm2 +template inline RealD maxLocalNorm2(const Lattice &arg) +{ + typedef typename vobj::tensor_reduced vscalar; //iScalar > > + typedef typename vscalar::scalar_object scalar; //iScalar > > + + Lattice inner = localNorm2(arg); + + auto grid = arg.Grid(); + + RealD max; + for(int l=0;llSites();l++){ + Coordinate coor; + scalar val; + RealD r; + grid->LocalIndexToLocalCoor(l,coor); + peekLocalSite(val,inner,coor); + r=real(TensorRemove(val)); + if( (l==0) || (r>max)){ + max=r; + } + } + grid->GlobalMax(max); + return max; +} + // Double inner product template inline ComplexD rankInnerProduct(const Lattice &left,const Lattice &right) diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index e698e40e..5a26cce9 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -97,6 +97,20 @@ accelerator_inline void convertType(ComplexF & out, const std::complex & out = in; } +template 
+accelerator_inline EnableIf> convertType(T & out, const T & in) { + out = in; +} + +// This would allow for conversions between GridFundamental types, but is not strictly needed as yet +/*template +accelerator_inline typename std::enable_if::value && isGridFundamental::value>::type +// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed +//accelerator_inline typename std::enable_if::value && !isGridTensor::value>::type +convertType(T1 & out, const T2 & in) { + out = in; +}*/ + #ifdef GRID_SIMT accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in; @@ -117,18 +131,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); } -template - accelerator_inline void convertType(iMatrix & out, const iMatrix & in); -template - accelerator_inline void convertType(iVector & out, const iVector & in); +template +accelerator_inline void convertType(iScalar & out, const iScalar & in) { + convertType(out._internal,in._internal); +} -template::value, T1>::type* = nullptr> -accelerator_inline void convertType(T1 & out, const iScalar & in) { +template +accelerator_inline NotEnableIf> convertType(T1 & out, const iScalar & in) { convertType(out,in._internal); } template -accelerator_inline void convertType(iScalar & out, const T2 & in) { +accelerator_inline NotEnableIf> convertType(iScalar & out, const T2 & in) { convertType(out._internal,in); } @@ -145,11 +159,6 @@ accelerator_inline void convertType(iVector & out, const iVector & i convertType(out._internal[i],in._internal[i]); } -template::value, T>::type* = nullptr> -accelerator_inline void convertType(T & out, const T & in) { - out = in; -} - template accelerator_inline void convertType(Lattice & out, const Lattice & in) { autoView( out_v , out,AcceleratorWrite); diff --git 
a/Grid/lattice/Lattice_view.h b/Grid/lattice/Lattice_view.h index 3b76b921..cb568abd 100644 --- a/Grid/lattice/Lattice_view.h +++ b/Grid/lattice/Lattice_view.h @@ -67,9 +67,14 @@ public: accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } #endif +#if 1 + // accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; + accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; }; +#else accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; }; - +#endif + accelerator_inline uint64_t begin(void) const { return 0;}; accelerator_inline uint64_t end(void) const { return this->_odata_size; }; accelerator_inline uint64_t size(void) const { return this->_odata_size; }; diff --git a/Grid/lattice/Lattice_where.h b/Grid/lattice/Lattice_where.h index 6686d1b3..777f4015 100644 --- a/Grid/lattice/Lattice_where.h +++ b/Grid/lattice/Lattice_where.h @@ -43,7 +43,7 @@ inline void whereWolf(Lattice &ret,const Lattice &predicate,Lattice< conformable(iftrue,predicate); conformable(iftrue,ret); - GridBase *grid=iftrue._grid; + GridBase *grid=iftrue.Grid(); typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_type scalar_type; @@ -52,22 +52,23 @@ inline void whereWolf(Lattice &ret,const Lattice &predicate,Lattice< const int Nsimd = grid->Nsimd(); - std::vector mask(Nsimd); - std::vector truevals (Nsimd); - std::vector falsevals(Nsimd); - - parallel_for(int ss=0;ssoSites(); ss++){ - - extract(iftrue._odata[ss] ,truevals); - extract(iffalse._odata[ss] ,falsevals); - extract(TensorRemove(predicate._odata[ss]),mask); - - for(int s=0;soSites(); + thread_for(ss,NN,{ + Integer mask; + scalar_object trueval; + scalar_object falseval; + for(int l=0;l @@ -76,9 +77,9 @@ inline Lattice whereWolf(const Lattice &predicate,Lattice &ift conformable(iftrue,iffalse); 
conformable(iftrue,predicate); - Lattice ret(iftrue._grid); + Lattice ret(iftrue.Grid()); - where(ret,predicate,iftrue,iffalse); + whereWolf(ret,predicate,iftrue,iffalse); return ret; } diff --git a/Grid/log/Log.h b/Grid/log/Log.h index d459a4a9..68693647 100644 --- a/Grid/log/Log.h +++ b/Grid/log/Log.h @@ -130,6 +130,8 @@ public: friend std::ostream& operator<< (std::ostream& stream, Logger& log){ if ( log.active ) { + std::ios_base::fmtflags f(stream.flags()); + stream << log.background()<< std::left; if (log.topWidth > 0) { @@ -152,6 +154,8 @@ public: << now << log.background() << " : " ; } stream << log.colour(); + stream.flags(f); + return stream; } else { return devnull; diff --git a/Grid/parallelIO/BinaryIO.cc b/Grid/parallelIO/BinaryIO.cc index 221a7fe8..ef1b6683 100644 --- a/Grid/parallelIO/BinaryIO.cc +++ b/Grid/parallelIO/BinaryIO.cc @@ -1,3 +1,4 @@ #include -int Grid::BinaryIO::latticeWriteMaxRetry = -1; +int Grid::BinaryIO::latticeWriteMaxRetry = -1; +Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf; diff --git a/Grid/parallelIO/BinaryIO.h b/Grid/parallelIO/BinaryIO.h index 1f11add9..e9893aa1 100644 --- a/Grid/parallelIO/BinaryIO.h +++ b/Grid/parallelIO/BinaryIO.h @@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key) /////////////////////////////////////////////////////////////////////////////////////////////////// class BinaryIO { public: + struct IoPerf + { + uint64_t size{0},time{0}; + double mbytesPerSecond{0.}; + }; + + static IoPerf lastPerf; static int latticeWriteMaxRetry; ///////////////////////////////////////////////////////////////////////////// @@ -502,12 +509,15 @@ class BinaryIO { timer.Stop(); } + lastPerf.size = sizeof(fobj)*iodata.size()*nrank; + lastPerf.time = timer.useconds(); + lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6); std::cout< tmp(RngStateCount); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); - parallel_rng.SetState(tmp,lidx); + Coordinate lcoor; + 
grid->LocalIndexToLocalCoor(lidx, lcoor); + int o_idx=grid->oIndex(lcoor); + int i_idx=grid->iIndex(lcoor); + int gidx=parallel_rng.generator_idx(o_idx,i_idx); + parallel_rng.SetState(tmp,gidx); }); timer.Stop(); @@ -723,7 +738,12 @@ class BinaryIO { std::vector iodata(lsites); thread_for(lidx,lsites,{ std::vector tmp(RngStateCount); - parallel_rng.GetState(tmp,lidx); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(lidx, lcoor); + int o_idx=grid->oIndex(lcoor); + int i_idx=grid->iIndex(lcoor); + int gidx=parallel_rng.generator_idx(o_idx,i_idx); + parallel_rng.GetState(tmp,gidx); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); }); timer.Stop(); diff --git a/Grid/parallelIO/IldgIO.h b/Grid/parallelIO/IldgIO.h index b564371b..ef42c159 100644 --- a/Grid/parallelIO/IldgIO.h +++ b/Grid/parallelIO/IldgIO.h @@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5); //////////////////////////////////////////////////////////// // Helper to fill out metadata //////////////////////////////////////////////////////////// - template void ScidacMetaData(Lattice & field, +template void ScidacMetaData(Lattice & field, FieldMetaData &header, scidacRecord & _scidacRecord, scidacFile & _scidacFile) @@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter { // Don't require scidac records EXCEPT checksum // Use Grid MetaData object if present. 
//////////////////////////////////////////////////////////////// - template - void writeConfiguration(Lattice > &Umu,int sequence,std::string LFN,std::string description) + template + void writeConfiguration(Lattice &Umu,int sequence,std::string LFN,std::string description) { GridBase * grid = Umu.Grid(); - typedef Lattice > GaugeField; - typedef iLorentzColourMatrix vobj; + typedef Lattice GaugeField; + typedef vLorentzColourMatrixD vobj; typedef typename vobj::scalar_object sobj; //////////////////////////////////////// @@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter { ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); + stats Stats; + Stats(Umu,header); + std::string format = header.floating_point; header.ensemble_id = description; header.ensemble_label = description; @@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader { // Else use ILDG MetaData object if present. // Else use SciDAC MetaData object if present. //////////////////////////////////////////////////////////////// - template - void readConfiguration(Lattice > &Umu, FieldMetaData &FieldMetaData_) { + template + void readConfiguration(Lattice &Umu, FieldMetaData &FieldMetaData_) { - typedef Lattice > GaugeField; + typedef Lattice GaugeField; typedef typename GaugeField::vector_object vobj; typedef typename vobj::scalar_object sobj; @@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader { if ( found_FieldMetaData || found_usqcdInfo ) { FieldMetaData checker; - GaugeStatistics(Umu,checker); + stats Stats; + Stats(Umu,checker); assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; diff --git a/Grid/parallelIO/MetaData.h b/Grid/parallelIO/MetaData.h index 4c1cfbdb..af8b3f76 100644 --- a/Grid/parallelIO/MetaData.h +++ b/Grid/parallelIO/MetaData.h @@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData 
&header) std::time_t t = std::time(nullptr); std::tm tm_ = *std::localtime(&t); std::ostringstream oss; - // oss << std::put_time(&tm_, "%c %Z"); + oss << std::put_time(&tm_, "%c %Z"); header.creation_date = oss.str(); header.archive_date = header.creation_date; @@ -176,29 +176,18 @@ template inline void PrepareMetaData(Lattice & field, FieldMet GridMetaData(grid,header); MachineCharacteristics(header); } -inline void GaugeStatistics(Lattice & data,FieldMetaData &header) +template +class GaugeStatistics { - // How to convert data precision etc... - header.link_trace=WilsonLoops::linkTrace(data); - header.plaquette =WilsonLoops::avgPlaquette(data); -} -inline void GaugeStatistics(Lattice & data,FieldMetaData &header) -{ - // How to convert data precision etc... - header.link_trace=WilsonLoops::linkTrace(data); - header.plaquette =WilsonLoops::avgPlaquette(data); -} -template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) -{ - - GridBase *grid = field.Grid(); - std::string format = getFormatString(); - header.floating_point = format; - header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac - GridMetaData(grid,header); - GaugeStatistics(field,header); - MachineCharacteristics(header); -} +public: + void operator()(Lattice & data,FieldMetaData &header) + { + header.link_trace=WilsonLoops::linkTrace(data); + header.plaquette =WilsonLoops::avgPlaquette(data); + } +}; +typedef GaugeStatistics PeriodicGaugeStatistics; +typedef GaugeStatistics ConjugateGaugeStatistics; template<> inline void PrepareMetaData(Lattice & field, FieldMetaData &header) { GridBase *grid = field.Grid(); @@ -206,7 +195,6 @@ template<> inline void PrepareMetaData(Lattice GaugeField; + static inline void truncate(std::string file){ std::ofstream fout(file,std::ios::out); } @@ -129,12 +131,12 @@ public: // Now the meat: the object readers ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - template - 
static inline void readConfiguration(Lattice > &Umu, + template + static inline void readConfiguration(GaugeField &Umu, FieldMetaData& header, - std::string file) + std::string file, + GaugeStats GaugeStatisticsCalculator=GaugeStats()) { - typedef Lattice > GaugeField; GridBase *grid = Umu.Grid(); uint64_t offset = readHeader(file,Umu.Grid(),header); @@ -153,23 +155,23 @@ public: // munger is a function of if ( header.data_type == std::string("4D_SU3_GAUGE") ) { if ( ieee32 || ieee32big ) { - BinaryIO::readLatticeObject, LorentzColour2x3F> + BinaryIO::readLatticeObject (Umu,file,Gauge3x2munger(), offset,format, nersc_csum,scidac_csuma,scidac_csumb); } if ( ieee64 || ieee64big ) { - BinaryIO::readLatticeObject, LorentzColour2x3D> + BinaryIO::readLatticeObject (Umu,file,Gauge3x2munger(),offset,format, nersc_csum,scidac_csuma,scidac_csumb); } } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { if ( ieee32 || ieee32big ) { - BinaryIO::readLatticeObject,LorentzColourMatrixF> + BinaryIO::readLatticeObject (Umu,file,GaugeSimpleMunger(),offset,format, nersc_csum,scidac_csuma,scidac_csumb); } if ( ieee64 || ieee64big ) { - BinaryIO::readLatticeObject,LorentzColourMatrixD> + BinaryIO::readLatticeObject (Umu,file,GaugeSimpleMunger(),offset,format, nersc_csum,scidac_csuma,scidac_csumb); } @@ -177,7 +179,7 @@ public: assert(0); } - GaugeStatistics(Umu,clone); + GaugeStats Stats; Stats(Umu,clone); std::cout< - static inline void writeConfiguration(Lattice > &Umu, + // Preferred interface + template + static inline void writeConfiguration(Lattice &Umu, + std::string file, + std::string ens_label = std::string("DWF")) + { + writeConfiguration(Umu,file,0,1,ens_label); + } + template + static inline void writeConfiguration(Lattice &Umu, std::string file, int two_row, - int bits32) + int bits32, + std::string ens_label = std::string("DWF")) { - typedef Lattice > GaugeField; - - typedef iLorentzColourMatrix vobj; + typedef vLorentzColourMatrixD vobj; typedef typename 
vobj::scalar_object sobj; FieldMetaData header; @@ -219,8 +228,8 @@ public: // Following should become arguments /////////////////////////////////////////// header.sequence_number = 1; - header.ensemble_id = "UKQCD"; - header.ensemble_label = "DWF"; + header.ensemble_id = std::string("UKQCD"); + header.ensemble_label = ens_label; typedef LorentzColourMatrixD fobj3D; typedef LorentzColour2x3D fobj2D; @@ -229,28 +238,28 @@ public: GridMetaData(grid,header); assert(header.nd==4); - GaugeStatistics(Umu,header); + GaugeStats Stats; Stats(Umu,header); MachineCharacteristics(header); - uint64_t offset; + uint64_t offset; // Sod it -- always write 3x3 double header.floating_point = std::string("IEEE64BIG"); header.data_type = std::string("4D_SU3_GAUGE_3x3"); GaugeSimpleUnmunger munge; - if ( grid->IsBoss() ) { - truncate(file); - offset = writeHeader(header,file); - } - grid->Broadcast(0,(void *)&offset,sizeof(offset)); + if ( grid->IsBoss() ) { + truncate(file); + offset = writeHeader(header,file); + } + grid->Broadcast(0,(void *)&offset,sizeof(offset)); uint32_t nersc_csum,scidac_csuma,scidac_csumb; BinaryIO::writeLatticeObject(Umu,file,munge,offset,header.floating_point, nersc_csum,scidac_csuma,scidac_csumb); header.checksum = nersc_csum; - if ( grid->IsBoss() ) { - writeHeader(header,file); - } + if ( grid->IsBoss() ) { + writeHeader(header,file); + } std::cout<Barrier(); timer.Stop(); std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl; - GaugeStatistics(Umu, clone); + PeriodicGaugeStatistics Stats; Stats(Umu, clone); RealD plaq_diff = fabs(clone.plaquette - header.plaquette); diff --git a/Grid/parallelIO/OpenQcdIOChromaReference.h b/Grid/parallelIO/OpenQcdIOChromaReference.h index bab54fe8..886536ad 100644 --- a/Grid/parallelIO/OpenQcdIOChromaReference.h +++ b/Grid/parallelIO/OpenQcdIOChromaReference.h @@ -208,7 +208,7 @@ public: FieldMetaData clone(header); - GaugeStatistics(Umu, clone); + 
PeriodicGaugeStatistics Stats; Stats(Umu, clone); RealD plaq_diff = fabs(clone.plaquette - header.plaquette); diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index faacac63..858aead7 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -47,7 +47,7 @@ static constexpr int Ym = 5; static constexpr int Zm = 6; static constexpr int Tm = 7; -static constexpr int Nc=3; +static constexpr int Nc=Config_Nc; static constexpr int Ns=4; static constexpr int Nd=4; static constexpr int Nhs=2; // half spinor @@ -80,6 +80,13 @@ template struct isSpinor { template using IfSpinor = Invoke::value,int> > ; template using IfNotSpinor = Invoke::value,int> > ; +const int CoarseIndex = 4; +template struct isCoarsened { + static constexpr bool value = (CoarseIndex<=T::TensorLevel); +}; +template using IfCoarsened = Invoke::value,int> > ; +template using IfNotCoarsened = Invoke::value,int> > ; + // ChrisK very keen to add extra space for Gparity doubling. // // Also add domain wall index, in a way where Wilson operator diff --git a/Grid/qcd/action/ActionBase.h b/Grid/qcd/action/ActionBase.h index bff21d1d..17980ee0 100644 --- a/Grid/qcd/action/ActionBase.h +++ b/Grid/qcd/action/ActionBase.h @@ -41,7 +41,7 @@ class Action public: bool is_smeared = false; // Heatbath? 
- virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual std::string action_name() = 0; // return the action name diff --git a/Grid/qcd/action/fermion/Fermion.h b/Grid/qcd/action/fermion/Fermion.h index 16252340..09777204 100644 --- a/Grid/qcd/action/fermion/Fermion.h +++ b/Grid/qcd/action/fermion/Fermion.h @@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DR; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermion5DD; -#ifndef GRID_CUDA -typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dR; -typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dF; -typedef ImprovedStaggeredFermion5D ImprovedStaggeredFermionVec5dD; -#endif - NAMESPACE_END(Grid); //////////////////// diff --git a/Grid/qcd/action/fermion/FermionOperatorImpl.h b/Grid/qcd/action/fermion/FermionOperatorImpl.h index b444f6dc..56aaca12 100644 --- a/Grid/qcd/action/fermion/FermionOperatorImpl.h +++ b/Grid/qcd/action/fermion/FermionOperatorImpl.h @@ -153,8 +153,8 @@ public: typedef typename Impl::StencilImpl StencilImpl; \ typedef typename Impl::ImplParams ImplParams; \ typedef typename Impl::StencilImpl::View_type StencilView; \ - typedef typename ViewMap::Type FermionFieldView; \ - typedef typename ViewMap::Type DoubledGaugeFieldView; + typedef const typename ViewMap::Type FermionFieldView; \ + typedef const typename ViewMap::Type DoubledGaugeFieldView; #define INHERIT_IMPL_TYPES(Base) \ INHERIT_GIMPL_TYPES(Base) \ @@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered); ///////////////////////////////////////////////////////////////////////////// // Single flavour 
one component spinors with colour index. 5d vec ///////////////////////////////////////////////////////////////////////////// -#include -NAMESPACE_CHECK(ImplStaggered5dVec); +// Deprecate Vec5d +//#include +//NAMESPACE_CHECK(ImplStaggered5dVec); diff --git a/Grid/qcd/action/fermion/GparityWilsonImpl.h b/Grid/qcd/action/fermion/GparityWilsonImpl.h index 0b726db9..9dca403b 100644 --- a/Grid/qcd/action/fermion/GparityWilsonImpl.h +++ b/Grid/qcd/action/fermion/GparityWilsonImpl.h @@ -97,42 +97,30 @@ public: Coordinate icoor; #ifdef GRID_SIMT - _Spinor tmp; - const int Nsimd =SiteDoubledGaugeField::Nsimd(); int s = acceleratorSIMTlane(Nsimd); St.iCoorFromIindex(icoor,s); int mmu = mu % Nd; - if ( SE->_around_the_world && St.parameters.twists[mmu] ) { - - int permute_lane = (sl==1) - || ((distance== 1)&&(icoor[direction]==1)) - || ((distance==-1)&&(icoor[direction]==0)); - if ( permute_lane ) { - tmp(0) = chi(1); - tmp(1) = chi(0); - } else { - tmp(0) = chi(0); - tmp(1) = chi(1); - } + auto UU0=coalescedRead(U(0)(mu)); + auto UU1=coalescedRead(U(1)(mu)); + + //Decide whether we do a G-parity flavor twist + //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir + //It also assumes (but does not check) that abs(distance) == 1 + int permute_lane = (sl==1) + || ((distance== 1)&&(icoor[direction]==1)) + || ((distance==-1)&&(icoor[direction]==0)); - auto UU0=coalescedRead(U(0)(mu)); - auto UU1=coalescedRead(U(1)(mu)); + permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world - mult(&phi(0),&UU0,&tmp(0)); - mult(&phi(1),&UU1,&tmp(1)); + //Apply the links + int f_upper = permute_lane ? 
1 : 0; + int f_lower = !f_upper; - } else { - - auto UU0=coalescedRead(U(0)(mu)); - auto UU1=coalescedRead(U(1)(mu)); - - mult(&phi(0),&UU0,&chi(0)); - mult(&phi(1),&UU1,&chi(1)); - - } + mult(&phi(0),&UU0,&chi(f_upper)); + mult(&phi(1),&UU1,&chi(f_lower)); #else typedef _Spinor vobj; diff --git a/Grid/qcd/action/fermion/MADWF.h b/Grid/qcd/action/fermion/MADWF.h index 6b3c6e71..5d17e865 100644 --- a/Grid/qcd/action/fermion/MADWF.h +++ b/Grid/qcd/action/fermion/MADWF.h @@ -85,7 +85,7 @@ class MADWF maxiter =_maxiter; }; - void operator() (const FermionFieldo &src4,FermionFieldo &sol5) + void operator() (const FermionFieldo &src,FermionFieldo &sol5) { std::cout << GridLogMessage<< " ************************************************" << std::endl; std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl; @@ -114,8 +114,16 @@ class MADWF /////////////////////////////////////// //Import source, include Dminus factors /////////////////////////////////////// - Mato.ImportPhysicalFermionSource(src4,b); - std::cout << GridLogMessage << " src4 " < + static accelerator_inline void multLink(_Spinor &phi, const SiteDoubledGaugeField &U, - const SiteSpinor &chi, + const _Spinor &chi, int mu) { - mult(&phi(), &U(mu), &chi()); + auto UU = coalescedRead(U(mu)); + mult(&phi(), &UU, &chi()); } - static accelerator_inline void multLinkAdd(SiteSpinor &phi, + template + static accelerator_inline void multLinkAdd(_Spinor &phi, const SiteDoubledGaugeField &U, - const SiteSpinor &chi, + const _Spinor &chi, int mu) { - mac(&phi(), &U(mu), &chi()); + auto UU = coalescedRead(U(mu)); + mac(&phi(), &UU, &chi()); } template diff --git a/Grid/qcd/action/fermion/WilsonCloverFermion.h b/Grid/qcd/action/fermion/WilsonCloverFermion.h index 91ad6d6d..92af7111 100644 --- a/Grid/qcd/action/fermion/WilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/WilsonCloverFermion.h @@ -245,7 +245,7 @@ public: return out; } -private: +protected: // here fixing the 4 dimensions, make it more general? 
RealD csw_r; // Clover coefficient - spatial diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index 10e98f33..0760bcba 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -61,7 +61,7 @@ public: typedef typename SiteHalfSpinor::vector_type vComplexHigh; constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - accelerator_inline int CommDatumSize(void) { + accelerator_inline int CommDatumSize(void) const { return sizeof(SiteHalfCommSpinor); } @@ -69,7 +69,7 @@ public: /* Compress includes precision change if mpi data is not same */ /*****************************************************/ template - accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { + accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const { _SiteHalfSpinor tmp; projector::Proj(tmp,in,mu,dag); vstream(buf[o],tmp); @@ -81,7 +81,7 @@ public: accelerator_inline void Exchange(SiteHalfSpinor *mp, const SiteHalfSpinor * __restrict__ vp0, const SiteHalfSpinor * __restrict__ vp1, - Integer type,Integer o){ + Integer type,Integer o) const { SiteHalfSpinor tmp1; SiteHalfSpinor tmp2; exchange(tmp1,tmp2,vp0[o],vp1[o],type); @@ -93,7 +93,7 @@ public: /* Have a decompression step if mpi data is not same */ /*****************************************************/ accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out, - SiteHalfSpinor * __restrict__ in, Integer o) { + SiteHalfSpinor * __restrict__ in, Integer o) const { assert(0); } @@ -103,7 +103,7 @@ public: accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0, SiteHalfSpinor * __restrict__ out1, const SiteSpinor * __restrict__ in, - Integer j,Integer k, Integer m,Integer type) + Integer j,Integer k, Integer m,Integer type) const { SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp3, temp4; @@ -117,7 +117,7 @@ public: 
/*****************************************************/ /* Pass the info to the stencil */ /*****************************************************/ - accelerator_inline bool DecompressionStep(void) { return false; } + accelerator_inline bool DecompressionStep(void) const { return false; } }; @@ -142,7 +142,7 @@ public: typedef typename SiteHalfSpinor::vector_type vComplexHigh; constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); - accelerator_inline int CommDatumSize(void) { + accelerator_inline int CommDatumSize(void) const { return sizeof(SiteHalfCommSpinor); } @@ -150,7 +150,7 @@ public: /* Compress includes precision change if mpi data is not same */ /*****************************************************/ template - accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) { + accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const { _SiteHalfSpinor hsp; SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; projector::Proj(hsp,in,mu,dag); @@ -163,7 +163,7 @@ public: accelerator_inline void Exchange(SiteHalfSpinor *mp, SiteHalfSpinor *vp0, SiteHalfSpinor *vp1, - Integer type,Integer o){ + Integer type,Integer o) const { SiteHalfSpinor vt0,vt1; SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; @@ -175,7 +175,7 @@ public: /*****************************************************/ /* Have a decompression step if mpi data is not same */ /*****************************************************/ - accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){ + accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const { SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); } @@ -186,7 +186,7 @@ public: accelerator_inline void CompressExchange(SiteHalfSpinor *out0, SiteHalfSpinor *out1, const SiteSpinor *in, - 
Integer j,Integer k, Integer m,Integer type){ + Integer j,Integer k, Integer m,Integer type) const { SiteHalfSpinor temp1, temp2,temp3,temp4; SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; @@ -200,7 +200,7 @@ public: /*****************************************************/ /* Pass the info to the stencil */ /*****************************************************/ - accelerator_inline bool DecompressionStep(void) { return true; } + accelerator_inline bool DecompressionStep(void) const { return true; } }; diff --git a/Grid/qcd/action/fermion/WilsonImpl.h b/Grid/qcd/action/fermion/WilsonImpl.h index 52e1ee00..2ff6feba 100644 --- a/Grid/qcd/action/fermion/WilsonImpl.h +++ b/Grid/qcd/action/fermion/WilsonImpl.h @@ -72,7 +72,7 @@ public: typedef WilsonCompressor Compressor; typedef WilsonImplParams ImplParams; typedef WilsonStencil StencilImpl; - typedef typename StencilImpl::View_type StencilView; + typedef const typename StencilImpl::View_type StencilView; ImplParams Params; @@ -106,11 +106,15 @@ public: const _SpinorField & phi, int mu) { + const int Nsimd = SiteHalfSpinor::Nsimd(); autoView( out_v, out, AcceleratorWrite); autoView( phi_v, phi, AcceleratorRead); autoView( Umu_v, Umu, AcceleratorRead); - accelerator_for(sss,out.Grid()->oSites(),1,{ - multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu); + typedef decltype(coalescedRead(out_v[0])) calcSpinor; + accelerator_for(sss,out.Grid()->oSites(),Nsimd,{ + calcSpinor tmp; + multLink(tmp,Umu_v[sss],phi_v(sss),mu); + coalescedWrite(out_v[sss],tmp); }); } @@ -180,18 +184,22 @@ public: mat = TraceIndex(P); } - inline void extractLinkField(std::vector &mat, DoubledGaugeField &Uds){ + inline void extractLinkField(std::vector &mat, DoubledGaugeField &Uds) + { for (int mu = 0; mu < Nd; mu++) mat[mu] = PeekIndex(Uds, mu); } - - inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu){ - + inline void InsertForce5D(GaugeField &mat, 
FermionField &Btilde, FermionField Ã,int mu) + { +#undef USE_OLD_INSERT_FORCE int Ls=Btilde.Grid()->_fdimensions[0]; + autoView( mat_v , mat, AcceleratorWrite); +#ifdef USE_OLD_INSERT_FORCE GaugeLinkField tmp(mat.Grid()); tmp = Zero(); { + const int Nsimd = SiteSpinor::Nsimd(); autoView( tmp_v , tmp, AcceleratorWrite); autoView( Btilde_v , Btilde, AcceleratorRead); autoView( Atilde_v , Atilde, AcceleratorRead); @@ -204,6 +212,29 @@ public: }); } PokeIndex(mat,tmp,mu); +#else + { + const int Nsimd = SiteSpinor::Nsimd(); + autoView( Btilde_v , Btilde, AcceleratorRead); + autoView( Atilde_v , Atilde, AcceleratorRead); + accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{ + int sU=sss; + typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType; + ColorMatrixType sum; + zeroit(sum); + for(int s=0;s Base; - + typedef AcceleratorVector StencilVector; public: +#ifdef GRID_SYCL +#define SYCL_HACK +#endif +#ifdef SYCL_HACK + static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf, + int ss,int sU,const SiteSpinor *in, SiteSpinor *out); +#endif + static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1,int exterior=1) ; diff --git a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h index b3fbe096..c3e0f821 100644 --- a/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CayleyFermion5DImplementation.h @@ -642,7 +642,7 @@ void CayleyFermion5D::ContractConservedCurrent( PropagatorField &q_in_1, Current curr_type, unsigned int mu) { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY, @@ -826,7 +826,7 @@ void 
CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } #endif -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) +#if (!defined(GRID_HIP)) int tshift = (mu == Nd-1) ? 1 : 0; //////////////////////////////////////////////// // GENERAL CAYLEY CASE @@ -880,11 +880,23 @@ void CayleyFermion5D::SeqConservedCurrent(PropagatorField &q_in, } std::vector G_s(Ls,1.0); + Integer sign = 1; // sign flip for vector/tadpole if ( curr_type == Current::Axial ) { for(int s=0;s_b; + auto c=this->_c; + if ( b == 1 && c == 0 ) { + sign = -1; + } + else { + std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl; + assert(b==1 && c==0); + } + } for(int s=0;s::SeqConservedCurrent(PropagatorField &q_in, tmp = Cshift(tmp,mu,1); Impl::multLinkField(Utmp,this->Umu,tmp,mu); - tmp = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop + tmp = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop tmp = where((lcoor>=tmin),tmp,zz); // Mask the time L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h index 63fd2a2f..e9cacbcf 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h @@ -680,7 +680,8 @@ void StaggeredKernels::DhopSiteAsm(StencilView &st, gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge3 =(uint64_t)&UU[sU]( T ); - +#undef STAG_VEC5D +#ifdef STAG_VEC5D // This is the single precision 5th direction vectorised kernel #include template <> void StaggeredKernels::DhopSiteAsm(StencilView &st, @@ -790,7 +791,7 @@ template <> void StaggeredKernels::DhopSiteAsm(StencilView #endif } - +#endif #define PERMUTE_DIR3 __asm__ ( \ diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h index 6bcb22b4..2b6087bc 100644 --- 
a/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsHand.h @@ -32,25 +32,50 @@ Author: paboyle NAMESPACE_BEGIN(Grid); -#define LOAD_CHI(b) \ +#ifdef GRID_SIMT + +#define LOAD_CHI(ptype,b) \ + const SiteSpinor & ref (b[offset]); \ + Chi_0=coalescedReadPermute(ref()()(0),perm,lane); \ + Chi_1=coalescedReadPermute(ref()()(1),perm,lane); \ + Chi_2=coalescedReadPermute(ref()()(2),perm,lane); + +#define LOAD_CHI_COMMS(b) \ const SiteSpinor & ref (b[offset]); \ - Chi_0=ref()()(0);\ - Chi_1=ref()()(1);\ - Chi_2=ref()()(2); + Chi_0=coalescedRead(ref()()(0),lane); \ + Chi_1=coalescedRead(ref()()(1),lane); \ + Chi_2=coalescedRead(ref()()(2),lane); + +#define PERMUTE_DIR(dir) ; +#else +#define LOAD_CHI(ptype,b) LOAD_CHI_COMMS(b) + +#define LOAD_CHI_COMMS(b) \ + const SiteSpinor & ref (b[offset]); \ + Chi_0=ref()()(0); \ + Chi_1=ref()()(1); \ + Chi_2=ref()()(2); + +#define PERMUTE_DIR(dir) \ + permute##dir(Chi_0,Chi_0); \ + permute##dir(Chi_1,Chi_1); \ + permute##dir(Chi_2,Chi_2); + +#endif // To splat or not to splat depends on the implementation #define MULT(A,UChi) \ auto & ref(U[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - Impl::loadLinkElement(U_02,ref()(0,2)); \ - Impl::loadLinkElement(U_12,ref()(1,2)); \ - Impl::loadLinkElement(U_22,ref()(2,2)); \ + U_00=coalescedRead(ref()(0,0),lane); \ + U_10=coalescedRead(ref()(1,0),lane); \ + U_20=coalescedRead(ref()(2,0),lane); \ + U_01=coalescedRead(ref()(0,1),lane); \ + U_11=coalescedRead(ref()(1,1),lane); \ + U_21=coalescedRead(ref()(2,1),lane); \ + U_02=coalescedRead(ref()(0,2),lane); \ + U_12=coalescedRead(ref()(1,2),lane); \ + U_22=coalescedRead(ref()(2,2),lane); \ UChi ## _0 = U_00*Chi_0; \ UChi ## _1 = U_10*Chi_0;\ UChi ## _2 
= U_20*Chi_0;\ @@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid); #define MULT_ADD(U,A,UChi) \ auto & ref(U[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - Impl::loadLinkElement(U_02,ref()(0,2)); \ - Impl::loadLinkElement(U_12,ref()(1,2)); \ - Impl::loadLinkElement(U_22,ref()(2,2)); \ + U_00=coalescedRead(ref()(0,0),lane); \ + U_10=coalescedRead(ref()(1,0),lane); \ + U_20=coalescedRead(ref()(2,0),lane); \ + U_01=coalescedRead(ref()(0,1),lane); \ + U_11=coalescedRead(ref()(1,1),lane); \ + U_21=coalescedRead(ref()(2,1),lane); \ + U_02=coalescedRead(ref()(0,2),lane); \ + U_12=coalescedRead(ref()(1,2),lane); \ + U_22=coalescedRead(ref()(2,2),lane); \ UChi ## _0 += U_00*Chi_0; \ UChi ## _1 += U_10*Chi_0;\ UChi ## _2 += U_20*Chi_0;\ @@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid); UChi ## _2 += U_22*Chi_2; -#define PERMUTE_DIR(dir) \ - permute##dir(Chi_0,Chi_0); \ - permute##dir(Chi_1,Chi_1); \ - permute##dir(Chi_2,Chi_2); - - #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ SE=st.GetEntry(ptype,Dir+skew,sF); \ offset = SE->_offset; \ local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHI(in); \ + LOAD_CHI(Perm,in); \ if ( perm) { \ PERMUTE_DIR(Perm); \ } \ } else { \ - LOAD_CHI(buf); \ + LOAD_CHI_COMMS(buf); \ } #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \ @@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid); } - #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \ SE=st.GetEntry(ptype,Dir+skew,sF); \ offset = SE->_offset; \ local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHI(in); \ + LOAD_CHI(Perm,in); \ if ( perm) { \ PERMUTE_DIR(Perm); \ } \ } else if ( st.same_node[Dir] ) { \ - LOAD_CHI(buf); \ + LOAD_CHI_COMMS(buf); \ } \ if (local || st.same_node[Dir] ) { \ MULT_ADD(U,Dir,even); \ @@ -140,10 +158,32 @@ 
NAMESPACE_BEGIN(Grid); local = SE->_is_local; \ if ((!local) && (!st.same_node[Dir]) ) { \ nmu++; \ - { LOAD_CHI(buf); } \ + { LOAD_CHI_COMMS(buf); } \ { MULT_ADD(U,Dir,even); } \ } +#define HAND_DECLARATIONS(Simd) \ + Simd even_0; \ + Simd even_1; \ + Simd even_2; \ + Simd odd_0; \ + Simd odd_1; \ + Simd odd_2; \ + \ + Simd Chi_0; \ + Simd Chi_1; \ + Simd Chi_2; \ + \ + Simd U_00; \ + Simd U_10; \ + Simd U_20; \ + Simd U_01; \ + Simd U_11; \ + Simd U_21; \ + Simd U_02; \ + Simd U_12; \ + Simd U_22; + template template accelerator_inline @@ -155,28 +195,14 @@ void StaggeredKernels::DhopSiteHand(StencilView &st, typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - Simd even_0; // 12 regs on knc - Simd even_1; - Simd even_2; - Simd odd_0; // 12 regs on knc - Simd odd_1; - Simd odd_2; - Simd Chi_0; // two spinor; 6 regs - Simd Chi_1; - Simd Chi_2; - - Simd U_00; // two rows of U matrix - Simd U_10; - Simd U_20; - Simd U_01; - Simd U_11; - Simd U_21; // 2 reg left. - Simd U_02; - Simd U_12; - Simd U_22; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + typedef decltype( coalescedRead( in[0]()()(0) )) Simt; + HAND_DECLARATIONS(Simt); - SiteSpinor result; + typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; + calcSiteSpinor result; int offset,local,perm, ptype; StencilEntry *SE; @@ -215,7 +241,7 @@ void StaggeredKernels::DhopSiteHand(StencilView &st, result()()(1) = even_1 + odd_1; result()()(2) = even_2 + odd_2; } - vstream(out[sF],result); + coalescedWrite(out[sF],result); } } @@ -230,28 +256,13 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - Simd even_0; // 12 regs on knc - Simd even_1; - Simd even_2; - Simd odd_0; // 12 regs on knc - Simd odd_1; - Simd odd_2; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + typedef decltype( coalescedRead( in[0]()()(0) )) Simt; + 
HAND_DECLARATIONS(Simt); - Simd Chi_0; // two spinor; 6 regs - Simd Chi_1; - Simd Chi_2; - - Simd U_00; // two rows of U matrix - Simd U_10; - Simd U_20; - Simd U_01; - Simd U_11; - Simd U_21; // 2 reg left. - Simd U_02; - Simd U_12; - Simd U_22; - - SiteSpinor result; + typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; + calcSiteSpinor result; int offset, ptype, local, perm; StencilEntry *SE; @@ -261,8 +272,8 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, // int sF=s+LLs*sU; { - even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); - odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); + zeroit(even_0); zeroit(even_1); zeroit(even_2); + zeroit(odd_0); zeroit(odd_1); zeroit(odd_2); skew = 0; HAND_STENCIL_LEG_INT(U,Xp,3,skew,even); @@ -294,7 +305,7 @@ void StaggeredKernels::DhopSiteHandInt(StencilView &st, result()()(1) = even_1 + odd_1; result()()(2) = even_2 + odd_2; } - vstream(out[sF],result); + coalescedWrite(out[sF],result); } } @@ -309,28 +320,13 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; - Simd even_0; // 12 regs on knc - Simd even_1; - Simd even_2; - Simd odd_0; // 12 regs on knc - Simd odd_1; - Simd odd_2; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + typedef decltype( coalescedRead( in[0]()()(0) )) Simt; + HAND_DECLARATIONS(Simt); - Simd Chi_0; // two spinor; 6 regs - Simd Chi_1; - Simd Chi_2; - - Simd U_00; // two rows of U matrix - Simd U_10; - Simd U_20; - Simd U_01; - Simd U_11; - Simd U_21; // 2 reg left. 
- Simd U_02; - Simd U_12; - Simd U_22; - - SiteSpinor result; + typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; + calcSiteSpinor result; int offset, ptype, local; StencilEntry *SE; @@ -340,8 +336,8 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, // int sF=s+LLs*sU; { - even_0 = Zero(); even_1 = Zero(); even_2 = Zero(); - odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero(); + zeroit(even_0); zeroit(even_1); zeroit(even_2); + zeroit(odd_0); zeroit(odd_1); zeroit(odd_2); int nmu=0; skew = 0; HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even); @@ -374,7 +370,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, result()()(1) = even_1 + odd_1; result()()(2) = even_2 + odd_2; } - out[sF] = out[sF] + result; + coalescedWrite(out[sF] , out(sF)+ result); } } } @@ -397,6 +393,7 @@ void StaggeredKernels::DhopSiteHandExt(StencilView &st, const FermionFieldView &in, FermionFieldView &out, int dag); \ */ #undef LOAD_CHI +#undef HAND_DECLARATIONS NAMESPACE_END(Grid); diff --git a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h index 0b6f9fb0..dd62e109 100644 --- a/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/StaggeredKernelsImplementation.h @@ -35,39 +35,32 @@ NAMESPACE_BEGIN(Grid); #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \ SE = st.GetEntry(ptype, Dir+skew, sF); \ if (SE->_is_local ) { \ - if (SE->_permute) { \ - chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ - } else { \ - chi_p = &in[SE->_offset]; \ - } \ + int perm= SE->_permute; \ + chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\ } else { \ - chi_p = &buf[SE->_offset]; \ + chi = coalescedRead(buf[SE->_offset],lane); \ } \ - multLink(Uchi, U[sU], *chi_p, Dir); + acceleratorSynchronise(); \ + multLink(Uchi, U[sU], chi, Dir); #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ SE = st.GetEntry(ptype, Dir+skew, sF); \ 
if (SE->_is_local ) { \ - if (SE->_permute) { \ - chi_p = χ \ - permute(chi, in[SE->_offset], ptype); \ - } else { \ - chi_p = &in[SE->_offset]; \ - } \ + int perm= SE->_permute; \ + chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\ } else if ( st.same_node[Dir] ) { \ - chi_p = &buf[SE->_offset]; \ + chi = coalescedRead(buf[SE->_offset],lane); \ } \ if (SE->_is_local || st.same_node[Dir] ) { \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + multLink(Uchi, U[sU], chi, Dir); \ } #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ SE = st.GetEntry(ptype, Dir+skew, sF); \ if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ nmu++; \ - chi_p = &buf[SE->_offset]; \ - multLink(Uchi, U[sU], *chi_p, Dir); \ + chi = coalescedRead(buf[SE->_offset],lane); \ + multLink(Uchi, U[sU], chi, Dir); \ } template @@ -84,12 +77,14 @@ void StaggeredKernels::DhopSiteGeneric(StencilView &st, SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dag) { - const SiteSpinor *chi_p; - SiteSpinor chi; - SiteSpinor Uchi; + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int skew; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGeneric(StencilView &st, if ( dag ) { Uchi = - Uchi; } - vstream(out[sF], Uchi); + coalescedWrite(out[sF], Uchi,lane); } }; @@ -130,13 +125,16 @@ template accelerator_inline void StaggeredKernels::DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, - const FermionFieldView &in, FermionFieldView &out,int dag) { - const SiteSpinor *chi_p; - SiteSpinor chi; - SiteSpinor Uchi; + const FermionFieldView &in, FermionFieldView &out,int dag) +{ + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int skew ; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int 
lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGenericInt(StencilView &st, if ( dag ) { Uchi = - Uchi; } - vstream(out[sF], Uchi); + coalescedWrite(out[sF], Uchi,lane); } }; @@ -178,14 +176,17 @@ template accelerator_inline void StaggeredKernels::DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, - const FermionFieldView &in, FermionFieldView &out,int dag) { - const SiteSpinor *chi_p; - // SiteSpinor chi; - SiteSpinor Uchi; + const FermionFieldView &in, FermionFieldView &out,int dag) +{ + typedef decltype(coalescedRead(in[0])) calcSpinor; + calcSpinor chi; + calcSpinor Uchi; StencilEntry *SE; int ptype; int nmu=0; int skew ; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); // for(int s=0;s::DhopSiteGenericExt(StencilView &st, GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); } - if ( nmu ) { - if ( dag ) { - out[sF] = out[sF] - Uchi; + if ( nmu ) { + auto _out = coalescedRead(out[sF],lane); + if ( dag ) { + coalescedWrite(out[sF], _out-Uchi,lane); } else { - out[sF] = out[sF] + Uchi; + coalescedWrite(out[sF], _out+Uchi,lane); } } } @@ -261,6 +263,8 @@ void StaggeredKernels::DhopImproved(StencilImpl &st, LebesgueOrder &lo, GridBase *FGrid=in.Grid(); GridBase *UGrid=U.Grid(); typedef StaggeredKernels ThisKernel; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); autoView( UUU_v , UUU, AcceleratorRead); autoView( U_v , U, AcceleratorRead); autoView( in_v , in, AcceleratorRead); @@ -301,6 +305,8 @@ void StaggeredKernels::DhopNaive(StencilImpl &st, LebesgueOrder &lo, GridBase *FGrid=in.Grid(); GridBase *UGrid=U.Grid(); typedef StaggeredKernels ThisKernel; + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); autoView( UUU_v , U, AcceleratorRead); autoView( U_v , U, AcceleratorRead); autoView( in_v , in, 
AcceleratorRead); diff --git a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h index df1bce7c..3032a80c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonCloverFermionImplementation.h @@ -92,20 +92,16 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) int lvol = _Umu.Grid()->lSites(); int DimRep = Impl::Dimension; - Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); - - Coordinate lcoor; - typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); - { autoView(CTv,CloverTerm,CpuRead); autoView(CTIv,CloverTermInv,CpuWrite); - for (int site = 0; site < lvol; site++) { + thread_for(site, lvol, { + Coordinate lcoor; grid->LocalIndexToLocalCoor(site, lcoor); - EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); + typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero(); peekLocalSite(Qx, CTv, lcoor); - Qxinv = Zero(); //if (csw!=0){ for (int j = 0; j < Ns; j++) for (int k = 0; k < Ns; k++) @@ -126,21 +122,21 @@ void WilsonCloverFermion::ImportGauge(const GaugeField &_Umu) // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; // } pokeLocalSite(Qxinv, CTIv, lcoor); - } + }); } // Separate the even and odd parts pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm); - pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm))); - pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm))); + pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm)); + 
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm)); pickCheckerboard(Even, CloverTermInvEven, CloverTermInv); pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv); - pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv))); - pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv))); + pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); + pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); } template diff --git a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h index 4977ea68..84ac25c1 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonFermionImplementation.h @@ -397,6 +397,7 @@ void WilsonFermion::DhopDerivEO(GaugeField &mat, const FermionField &U, co template void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int dag) { + DhopCalls+=2; conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), out.Grid()); @@ -408,6 +409,7 @@ void WilsonFermion::Dhop(const FermionField &in, FermionField &out, int da template void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int dag) { + DhopCalls++; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check @@ -420,6 +422,7 @@ void WilsonFermion::DhopOE(const FermionField &in, FermionField &out, int template void WilsonFermion::DhopEO(const FermionField &in, FermionField &out,int dag) { + DhopCalls++; conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), out.Grid()); // drops the cb check diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h index 2e587dfa..ffec05a0 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h +++ 
b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h @@ -38,9 +38,6 @@ Author: Nils Meyer Regensburg University // undefine everything related to kernels #include -// enable A64FX body -#define WILSONKERNELSASMBODYA64FX -//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h") /////////////////////////////////////////////////////////// // If we are A64FX specialise the single precision routine @@ -63,119 +60,89 @@ Author: Nils Meyer Regensburg University #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", 
"-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, 
FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// @@ -185,119 +152,89 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) 
#include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefine @@ -330,119 +267,89 @@ WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int 
Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + ///////////////////////////////////////////////////////////////// // XYZT vectorised, dag Kernel, double @@ -451,124 +358,93 @@ WilsonKernels::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV #define INTERIOR_AND_EXTERIOR #undef INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", 
"-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #define INTERIOR #undef EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) 
-#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + #undef INTERIOR_AND_EXTERIOR #undef INTERIOR #define EXTERIOR + +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif +#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void 
WilsonKernels::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) -#if defined (WILSONKERNELSASMBODYA64FX) #include -#else -#include -#endif + // undefs -#undef WILSONKERNELSASMBODYA64FX #include #endif //A64FXASM diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h index 406e5c25..4e463438 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h @@ -25,6 +25,11 @@ Author: Nils Meyer Regensburg University See the full license in the file "LICENSE" in the top level distribution directory *************************************************************************************/ /* END LEGAL */ + +// GCC 10 messes up SVE instruction scheduling using -O3, but +// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders +// performance now is better than armclang 20.2 + #ifdef KERNEL_DAG #define DIR0_PROJ XP_PROJ #define DIR1_PROJ YP_PROJ @@ -97,7 +102,7 @@ Author: Nils Meyer Regensburg University PROJ; \ MAYBEPERM(PERMUTE_DIR,perm); \ } else { \ - LOAD_CHI(base); \ + LOAD_CHI(base); \ } \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ MULT_2SPIN_1(Dir); \ @@ -110,6 +115,11 @@ Author: Nils Meyer Regensburg University } \ RECON; \ +/* +NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty + though I expected that it would improve on performance +*/ + #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ @@ -126,73 +136,63 @@ Author: Nils Meyer Regensburg University #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ basep = st.GetPFInfo(nent,plocal); nent++; \ - if ( local ) { \ - LOAD_CHIMU(base); \ - LOAD_TABLE(PERMUTE_DIR); \ - PROJ; \ - 
MAYBEPERM(PERMUTE_DIR,perm); \ - }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ - base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ - if ( local || st.same_node[Dir] ) { \ - MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ - MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - PREFETCH_CHIMU_L2(basep); \ - } else { PREFETCH_CHIMU(base); } \ + if ( local ) { \ + LOAD_CHIMU(base); \ + LOAD_TABLE(PERMUTE_DIR); \ + PROJ; \ + MAYBEPERM(PERMUTE_DIR,perm); \ + }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ + if ( local || st.same_node[Dir] ) { \ + MULT_2SPIN_1(Dir); \ + MULT_2SPIN_2; \ + RECON; \ + } \ + base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ + PREFETCH_CHIMU(base); \ + PREFETCH_CHIMU_L2(basep); \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ PREFETCH1_CHIMU(base); \ + { ZERO_PSI; } \ ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) #define RESULT(base,basep) SAVE_RESULT(base,basep); #endif + //////////////////////////////////////////////////////////////////////////////// // Post comms kernel //////////////////////////////////////////////////////////////////////////////// #ifdef EXTERIOR - #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ - if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } -#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ - nmu=0; \ - base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\ - 
if((!local)&&(!st.same_node[Dir]) ) { \ - LOAD_CHI(base); \ +#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ + nmu=0; \ + { ZERO_PSI;} \ + base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ + if((!local)&&(!st.same_node[Dir]) ) { \ + LOAD_CHI(base); \ MULT_2SPIN_1(Dir); \ - PREFETCH_CHIMU(base); \ - /* PREFETCH_GAUGE_L1(NxtDir); */ \ MULT_2SPIN_2; \ - if (s == 0) { \ - if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \ - } \ - RECON; \ - nmu++; \ + RECON; \ + nmu++; \ } #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #endif + + { int nmu; int local,perm, ptype; @@ -209,7 +209,6 @@ Author: Nils Meyer Regensburg University int ssn=ssU+1; if(ssn>=nmax) ssn=0; // int sUn=lo.Reorder(ssn); int sUn=ssn; - LOCK_GAUGE(0); #else int sU =ssU; int ssn=ssU+1; if(ssn>=nmax) ssn=0; @@ -295,6 +294,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + // { uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); #ifdef SHOW @@ -308,6 +312,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); #ifdef SHOW @@ -321,6 +330,11 @@ Author: Nils Meyer Regensburg University std::cout << "----------------------------------------------------" << std::endl; #endif + // DC ZVA test + //{ uint64_t basestore = (uint64_t)&out[ss]; + // PREFETCH_RESULT_L2_STORE(basestore); } + + ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); #ifdef SHOW @@ -341,6 +355,7 @@ Author: Nils Meyer Regensburg University base = (uint64_t) &out[ss]; basep= st.GetPFInfo(nent,plocal); ent++; basep = (uint64_t) &out[ssn]; + //PREFETCH_RESULT_L1_STORE(base); 
RESULT(base,basep); #ifdef SHOW diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h index 89ae5668..0703b613 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h @@ -76,7 +76,24 @@ Author: paboyle #define REGISTER -#define LOAD_CHIMU \ +#ifdef GRID_SIMT +#define LOAD_CHIMU(ptype) \ + {const SiteSpinor & ref (in[offset]); \ + Chimu_00=coalescedReadPermute(ref()(0)(0),perm,lane); \ + Chimu_01=coalescedReadPermute(ref()(0)(1),perm,lane); \ + Chimu_02=coalescedReadPermute(ref()(0)(2),perm,lane); \ + Chimu_10=coalescedReadPermute(ref()(1)(0),perm,lane); \ + Chimu_11=coalescedReadPermute(ref()(1)(1),perm,lane); \ + Chimu_12=coalescedReadPermute(ref()(1)(2),perm,lane); \ + Chimu_20=coalescedReadPermute(ref()(2)(0),perm,lane); \ + Chimu_21=coalescedReadPermute(ref()(2)(1),perm,lane); \ + Chimu_22=coalescedReadPermute(ref()(2)(2),perm,lane); \ + Chimu_30=coalescedReadPermute(ref()(3)(0),perm,lane); \ + Chimu_31=coalescedReadPermute(ref()(3)(1),perm,lane); \ + Chimu_32=coalescedReadPermute(ref()(3)(2),perm,lane); } +#define PERMUTE_DIR(dir) ; +#else +#define LOAD_CHIMU(ptype) \ {const SiteSpinor & ref (in[offset]); \ Chimu_00=ref()(0)(0);\ Chimu_01=ref()(0)(1);\ @@ -91,55 +108,55 @@ Author: paboyle Chimu_31=ref()(3)(1);\ Chimu_32=ref()(3)(2);} -#define LOAD_CHI\ - {const SiteHalfSpinor &ref(buf[offset]); \ - Chi_00 = ref()(0)(0);\ - Chi_01 = ref()(0)(1);\ - Chi_02 = ref()(0)(2);\ - Chi_10 = ref()(1)(0);\ - Chi_11 = ref()(1)(1);\ - Chi_12 = ref()(1)(2);} - -// To splat or not to splat depends on the implementation -#define MULT_2SPIN(A)\ - {auto & ref(U[sU](A)); \ - Impl::loadLinkElement(U_00,ref()(0,0)); \ - Impl::loadLinkElement(U_10,ref()(1,0)); \ - Impl::loadLinkElement(U_20,ref()(2,0)); \ - Impl::loadLinkElement(U_01,ref()(0,1)); \ - 
Impl::loadLinkElement(U_11,ref()(1,1)); \ - Impl::loadLinkElement(U_21,ref()(2,1)); \ - UChi_00 = U_00*Chi_00;\ - UChi_10 = U_00*Chi_10;\ - UChi_01 = U_10*Chi_00;\ - UChi_11 = U_10*Chi_10;\ - UChi_02 = U_20*Chi_00;\ - UChi_12 = U_20*Chi_10;\ - UChi_00+= U_01*Chi_01;\ - UChi_10+= U_01*Chi_11;\ - UChi_01+= U_11*Chi_01;\ - UChi_11+= U_11*Chi_11;\ - UChi_02+= U_21*Chi_01;\ - UChi_12+= U_21*Chi_11;\ - Impl::loadLinkElement(U_00,ref()(0,2)); \ - Impl::loadLinkElement(U_10,ref()(1,2)); \ - Impl::loadLinkElement(U_20,ref()(2,2)); \ - UChi_00+= U_00*Chi_02;\ - UChi_10+= U_00*Chi_12;\ - UChi_01+= U_10*Chi_02;\ - UChi_11+= U_10*Chi_12;\ - UChi_02+= U_20*Chi_02;\ - UChi_12+= U_20*Chi_12;} - - #define PERMUTE_DIR(dir) \ - permute##dir(Chi_00,Chi_00);\ + permute##dir(Chi_00,Chi_00); \ permute##dir(Chi_01,Chi_01);\ permute##dir(Chi_02,Chi_02);\ - permute##dir(Chi_10,Chi_10);\ + permute##dir(Chi_10,Chi_10); \ permute##dir(Chi_11,Chi_11);\ permute##dir(Chi_12,Chi_12); +#endif + +#define MULT_2SPIN(A)\ + {auto & ref(U[sU](A)); \ + U_00=coalescedRead(ref()(0,0),lane); \ + U_10=coalescedRead(ref()(1,0),lane); \ + U_20=coalescedRead(ref()(2,0),lane); \ + U_01=coalescedRead(ref()(0,1),lane); \ + U_11=coalescedRead(ref()(1,1),lane); \ + U_21=coalescedRead(ref()(2,1),lane); \ + UChi_00 = U_00*Chi_00; \ + UChi_10 = U_00*Chi_10; \ + UChi_01 = U_10*Chi_00; \ + UChi_11 = U_10*Chi_10; \ + UChi_02 = U_20*Chi_00; \ + UChi_12 = U_20*Chi_10; \ + UChi_00+= U_01*Chi_01; \ + UChi_10+= U_01*Chi_11; \ + UChi_01+= U_11*Chi_01; \ + UChi_11+= U_11*Chi_11; \ + UChi_02+= U_21*Chi_01; \ + UChi_12+= U_21*Chi_11; \ + U_00=coalescedRead(ref()(0,2),lane); \ + U_10=coalescedRead(ref()(1,2),lane); \ + U_20=coalescedRead(ref()(2,2),lane); \ + UChi_00+= U_00*Chi_02; \ + UChi_10+= U_00*Chi_12; \ + UChi_01+= U_10*Chi_02; \ + UChi_11+= U_10*Chi_12; \ + UChi_02+= U_20*Chi_02; \ + UChi_12+= U_20*Chi_12;} + +#define LOAD_CHI \ + {const SiteHalfSpinor &ref(buf[offset]); \ + Chi_00 = coalescedRead(ref()(0)(0),lane); \ + 
Chi_01 = coalescedRead(ref()(0)(1),lane); \ + Chi_02 = coalescedRead(ref()(0)(2),lane); \ + Chi_10 = coalescedRead(ref()(1)(0),lane); \ + Chi_11 = coalescedRead(ref()(1)(1),lane); \ + Chi_12 = coalescedRead(ref()(1)(2),lane);} + // hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(1)=fspin(1)+timesI(fspin(2)); #define XP_PROJ \ @@ -353,13 +370,13 @@ Author: paboyle result_31-= UChi_11; \ result_32-= UChi_12; -#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ +#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \ local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHIMU; \ + LOAD_CHIMU(PERM); \ PROJ; \ if ( perm) { \ PERMUTE_DIR(PERM); \ @@ -367,6 +384,37 @@ Author: paboyle } else { \ LOAD_CHI; \ } \ + acceleratorSynchronise(); \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \ + SE=&st_p[DIR+8*ss]; \ + ptype=st_perm[DIR]; \ + offset = SE->_offset; \ + local = SE->_is_local; \ + perm = SE->_permute; \ + if ( local ) { \ + LOAD_CHIMU(PERM); \ + PROJ; \ + if ( perm) { \ + PERMUTE_DIR(PERM); \ + } \ + } else { \ + LOAD_CHI; \ + } \ + acceleratorSynchronise(); \ + MULT_2SPIN(DIR); \ + RECON; + +#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \ + SE=&st_p[DIR+8*ss]; \ + ptype=st_perm[DIR]; \ + /*SE=st.GetEntry(ptype,DIR,ss);*/ \ + offset = SE->_offset; \ + perm = SE->_permute; \ + LOAD_CHIMU(PERM); \ + PROJ; \ MULT_2SPIN(DIR); \ RECON; @@ -376,7 +424,7 @@ Author: paboyle local = SE->_is_local; \ perm = SE->_permute; \ if ( local ) { \ - LOAD_CHIMU; \ + LOAD_CHIMU(PERM); \ PROJ; \ if ( perm) { \ PERMUTE_DIR(PERM); \ @@ -384,10 +432,12 @@ Author: paboyle } else if ( st.same_node[DIR] ) { \ LOAD_CHI; \ } \ + acceleratorSynchronise(); \ if (local || st.same_node[DIR] ) { \ MULT_2SPIN(DIR); \ RECON; \ - } + } \ + acceleratorSynchronise(); #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ SE=st.GetEntry(ptype,DIR,ss); \ @@ -397,44 +447,44 @@ Author: paboyle MULT_2SPIN(DIR); \ 
RECON; \ nmu++; \ - } + } \ + acceleratorSynchronise(); #define HAND_RESULT(ss) \ { \ - SiteSpinor & ref (out[ss]); \ - vstream(ref()(0)(0),result_00); \ - vstream(ref()(0)(1),result_01); \ - vstream(ref()(0)(2),result_02); \ - vstream(ref()(1)(0),result_10); \ - vstream(ref()(1)(1),result_11); \ - vstream(ref()(1)(2),result_12); \ - vstream(ref()(2)(0),result_20); \ - vstream(ref()(2)(1),result_21); \ - vstream(ref()(2)(2),result_22); \ - vstream(ref()(3)(0),result_30); \ - vstream(ref()(3)(1),result_31); \ - vstream(ref()(3)(2),result_32); \ + SiteSpinor & ref (out[ss]); \ + coalescedWrite(ref()(0)(0),result_00,lane); \ + coalescedWrite(ref()(0)(1),result_01,lane); \ + coalescedWrite(ref()(0)(2),result_02,lane); \ + coalescedWrite(ref()(1)(0),result_10,lane); \ + coalescedWrite(ref()(1)(1),result_11,lane); \ + coalescedWrite(ref()(1)(2),result_12,lane); \ + coalescedWrite(ref()(2)(0),result_20,lane); \ + coalescedWrite(ref()(2)(1),result_21,lane); \ + coalescedWrite(ref()(2)(2),result_22,lane); \ + coalescedWrite(ref()(3)(0),result_30,lane); \ + coalescedWrite(ref()(3)(1),result_31,lane); \ + coalescedWrite(ref()(3)(2),result_32,lane); \ } -#define HAND_RESULT_EXT(ss) \ - if (nmu){ \ - SiteSpinor & ref (out[ss]); \ - ref()(0)(0)+=result_00; \ - ref()(0)(1)+=result_01; \ - ref()(0)(2)+=result_02; \ - ref()(1)(0)+=result_10; \ - ref()(1)(1)+=result_11; \ - ref()(1)(2)+=result_12; \ - ref()(2)(0)+=result_20; \ - ref()(2)(1)+=result_21; \ - ref()(2)(2)+=result_22; \ - ref()(3)(0)+=result_30; \ - ref()(3)(1)+=result_31; \ - ref()(3)(2)+=result_32; \ +#define HAND_RESULT_EXT(ss) \ + { \ + SiteSpinor & ref (out[ss]); \ + coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00,lane); \ + coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01,lane); \ + coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02,lane); \ + coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10,lane); \ + 
coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11,lane); \ + coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12,lane); \ + coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20,lane); \ + coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21,lane); \ + coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22,lane); \ + coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30,lane); \ + coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31,lane); \ + coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32,lane); \ } - -#define HAND_DECLARATIONS(a) \ +#define HAND_DECLARATIONS(Simd) \ Simd result_00; \ Simd result_01; \ Simd result_02; \ @@ -466,19 +516,19 @@ Author: paboyle Simd U_11; \ Simd U_21; -#define ZERO_RESULT \ - result_00=Zero(); \ - result_01=Zero(); \ - result_02=Zero(); \ - result_10=Zero(); \ - result_11=Zero(); \ - result_12=Zero(); \ - result_20=Zero(); \ - result_21=Zero(); \ - result_22=Zero(); \ - result_30=Zero(); \ - result_31=Zero(); \ - result_32=Zero(); +#define ZERO_RESULT \ + zeroit(result_00); \ + zeroit(result_01); \ + zeroit(result_02); \ + zeroit(result_10); \ + zeroit(result_11); \ + zeroit(result_12); \ + zeroit(result_20); \ + zeroit(result_21); \ + zeroit(result_22); \ + zeroit(result_30); \ + zeroit(result_31); \ + zeroit(result_32); #define Chimu_00 Chi_00 #define Chimu_01 Chi_01 @@ -495,15 +545,53 @@ Author: paboyle NAMESPACE_BEGIN(Grid); + +#ifdef SYCL_HACK template accelerator_inline void -WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, - int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +WilsonKernels::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf, + int ss,int sU,const SiteSpinor *in, SiteSpinor *out) { // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; + typedef iSinglet vCplx; + // typedef decltype( coalescedRead( vCplx()()() )) Simt; + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; - HAND_DECLARATIONS(ignore); + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + + HAND_DECLARATIONS(Simt); + + int offset,local,perm, ptype; + StencilEntry *SE; + HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); + HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); + HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); + HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM); + HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM); + HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM); + HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM); + HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM); + HAND_RESULT(ss); +} +#endif + +template accelerator_inline void +WilsonKernels::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, + int ss,int sU,const FermionFieldView &in, FermionFieldView &out) +{ + auto st_p = st._entries_p; + auto st_perm = st._permute_type; +// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
+ typedef typename Simd::scalar_type S; + typedef typename Simd::vector_type V; + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; + + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + + HAND_DECLARATIONS(Simt); int offset,local,perm, ptype; StencilEntry *SE; @@ -523,10 +611,16 @@ template accelerator_inline void WilsonKernels::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; - HAND_DECLARATIONS(ignore); + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + + HAND_DECLARATIONS(Simt); StencilEntry *SE; int offset,local,perm, ptype; @@ -546,11 +640,17 @@ template accelerator_inline void WilsonKernels::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; - HAND_DECLARATIONS(ignore); + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + + HAND_DECLARATIONS(Simt); int offset,local,perm, ptype; StencilEntry *SE; @@ -570,10 +670,16 @@ template accelerator_inline void WilsonKernels::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; - HAND_DECLARATIONS(ignore); + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + + HAND_DECLARATIONS(Simt); StencilEntry *SE; int offset,local,perm, ptype; @@ -593,11 +699,17 @@ template accelerator_inline void WilsonKernels::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... 
typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; - HAND_DECLARATIONS(ignore); + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + + HAND_DECLARATIONS(Simt); int offset, ptype; StencilEntry *SE; @@ -618,10 +730,16 @@ template accelerator_inline void WilsonKernels::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int ss,int sU,const FermionFieldView &in, FermionFieldView &out) { + auto st_p = st._entries_p; + auto st_perm = st._permute_type; typedef typename Simd::scalar_type S; typedef typename Simd::vector_type V; + typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt; - HAND_DECLARATIONS(ignore); + const int Nsimd = SiteHalfSpinor::Nsimd(); + const int lane=acceleratorSIMTlane(Nsimd); + + HAND_DECLARATIONS(Simt); StencilEntry *SE; int offset, ptype; @@ -682,3 +800,4 @@ NAMESPACE_END(Grid); #undef HAND_RESULT #undef HAND_RESULT_INT #undef HAND_RESULT_EXT +#undef HAND_DECLARATIONS diff --git a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h index c5f50bbb..9228b84c 100644 --- a/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h +++ b/Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h @@ -416,7 +416,21 @@ void WilsonKernels::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S #undef LoopBody } -#define KERNEL_CALLNB(A) \ +#define KERNEL_CALL_TMP(A) \ + const uint64_t NN = Nsite*Ls; \ + auto U_p = & U_v[0]; \ + auto in_p = & in_v[0]; \ + auto out_p = & out_v[0]; \ + auto st_p = st_v._entries_p; \ + auto st_perm = st_v._permute_type; \ + accelerator_forNB( ss, NN, Simd::Nsimd(), { \ + int sF = ss; \ + int sU = ss/Ls; \ + WilsonKernels::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \ + }); \ + accelerator_barrier(); + +#define KERNEL_CALLNB(A) \ const uint64_t NN = Nsite*Ls; \ 
accelerator_forNB( ss, NN, Simd::Nsimd(), { \ int sF = ss; \ @@ -445,20 +459,24 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} -#ifndef GRID_CUDA +#ifdef SYCL_HACK + if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; } +#else if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;} +#endif +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} #endif } @@ -476,20 +494,20 @@ void WilsonKernels::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField if( interior && exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} #endif } else if( interior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;} +#ifndef GRID_CUDA if (Opt == 
WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} #endif } else if( exterior ) { if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} -#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;} +#ifndef GRID_CUDA if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} #endif } diff --git a/Grid/qcd/action/gauge/Gauge.cc b/Grid/qcd/action/gauge/Gauge.cc new file mode 100644 index 00000000..2b5e2691 --- /dev/null +++ b/Grid/qcd/action/gauge/Gauge.cc @@ -0,0 +1,38 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/action/gauge/Gauge.cc + +Copyright (C) 2020 + +Author: Peter Boyle +Author: Peter Boyle +Author: paboyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +#include + +NAMESPACE_BEGIN(Grid); + +std::vector ConjugateGaugeImplBase::_conjDirs; + +NAMESPACE_END(Grid); + diff --git a/Grid/qcd/action/gauge/GaugeImplTypes.h b/Grid/qcd/action/gauge/GaugeImplTypes.h index 9b7d5a60..2499e0e9 100644 --- a/Grid/qcd/action/gauge/GaugeImplTypes.h +++ b/Grid/qcd/action/gauge/GaugeImplTypes.h @@ -96,7 +96,7 @@ public: /////////////////////////////////////////////////////////// // Move these to another class // HMC auxiliary functions - static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) + static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG) { // Zbigniew Srocinsky thesis: // @@ -154,6 +154,10 @@ public: return Hsum.real(); } + static inline void Project(Field &U) { + ProjectSUn(U); + } + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { SU::HotConfiguration(pRNG, U); } diff --git a/Grid/qcd/action/gauge/GaugeImplementations.h b/Grid/qcd/action/gauge/GaugeImplementations.h index a14aec1b..16147c77 100644 --- a/Grid/qcd/action/gauge/GaugeImplementations.h +++ b/Grid/qcd/action/gauge/GaugeImplementations.h @@ -59,14 +59,14 @@ public: } static inline GaugeLinkField CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { - return Cshift(adj(Link), mu, -1); + return PeriodicBC::CovShiftIdentityBackward(Link, mu); } static inline GaugeLinkField CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { - return Link; + return PeriodicBC::CovShiftIdentityForward(Link,mu); } static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) { - return Cshift(Link, mu, 1); + return PeriodicBC::ShiftStaple(Link,mu); } static inline bool isPeriodicGaugeField(void) { return true; } @@ -74,7 +74,13 @@ public: // Composition with smeared link, bc's etc.. 
probably need multiple inheritance // Variable precision "S" and variable Nc -template class ConjugateGaugeImpl : public GimplTypes { +class ConjugateGaugeImplBase { +protected: + static std::vector _conjDirs; +}; + + template class ConjugateGaugeImpl : public GimplTypes, ConjugateGaugeImplBase { +private: public: INHERIT_GIMPL_TYPES(GimplTypes); @@ -84,47 +90,56 @@ public: //////////////////////////////////////////////////////////////////////////////////////////////////////////// template static Lattice CovShiftForward(const GaugeLinkField &Link, int mu, - const Lattice &field) { - return ConjugateBC::CovShiftForward(Link, mu, field); + const Lattice &field) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CovShiftForward(Link, mu, field); + else + return PeriodicBC::CovShiftForward(Link, mu, field); } template static Lattice CovShiftBackward(const GaugeLinkField &Link, int mu, - const Lattice &field) { - return ConjugateBC::CovShiftBackward(Link, mu, field); + const Lattice &field) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CovShiftBackward(Link, mu, field); + else + return PeriodicBC::CovShiftBackward(Link, mu, field); } static inline GaugeLinkField - CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) { - GridBase *grid = Link.Grid(); - int Lmu = grid->GlobalDimensions()[mu] - 1; - - Lattice> coor(grid); - LatticeCoordinate(coor, mu); - - GaugeLinkField tmp(grid); - tmp = adj(Link); - tmp = where(coor == Lmu, conjugate(tmp), tmp); - return Cshift(tmp, mu, -1); // moves towards positive mu + CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CovShiftIdentityBackward(Link, mu); + else + return PeriodicBC::CovShiftIdentityBackward(Link, mu); } static inline GaugeLinkField - CovShiftIdentityForward(const GaugeLinkField &Link, int mu) { - return Link; + CovShiftIdentityForward(const GaugeLinkField 
&Link, int mu) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::CovShiftIdentityForward(Link,mu); + else + return PeriodicBC::CovShiftIdentityForward(Link,mu); } - static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) { - GridBase *grid = Link.Grid(); - int Lmu = grid->GlobalDimensions()[mu] - 1; - - Lattice> coor(grid); - LatticeCoordinate(coor, mu); - - GaugeLinkField tmp(grid); - tmp = Cshift(Link, mu, 1); - tmp = where(coor == Lmu, conjugate(tmp), tmp); - return tmp; + static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) + { + assert(_conjDirs.size() == Nd); + if(_conjDirs[mu]) + return ConjugateBC::ShiftStaple(Link,mu); + else + return PeriodicBC::ShiftStaple(Link,mu); } + static inline void setDirections(std::vector &conjDirs) { _conjDirs=conjDirs; } + static inline std::vector getDirections(void) { return _conjDirs; } static inline bool isPeriodicGaugeField(void) { return false; } }; diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index 639aca19..7690092d 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -49,7 +49,7 @@ public: virtual std::string action_name(){return "PlaqPlusRectangleAction";} - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {}; // noop as no pseudoferms + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms virtual std::string LogParameters(){ std::stringstream sstream; diff --git a/Grid/qcd/action/gauge/WilsonGaugeAction.h b/Grid/qcd/action/gauge/WilsonGaugeAction.h index 40d600d2..f535b54f 100644 --- a/Grid/qcd/action/gauge/WilsonGaugeAction.h +++ b/Grid/qcd/action/gauge/WilsonGaugeAction.h @@ -54,8 +54,7 @@ public: return sstream.str(); } - virtual void refresh(const GaugeField &U, - GridParallelRNG &pRNG){}; // noop as no pseudoferms + virtual void 
refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){}; // noop as no pseudoferms virtual RealD S(const GaugeField &U) { RealD plaq = WilsonLoops::avgPlaquette(U); diff --git a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h index 9fc0a3b0..576a8cf6 100644 --- a/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h +++ b/Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h @@ -124,7 +124,7 @@ NAMESPACE_BEGIN(Grid); // // As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta // - virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) + virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { Lop.ImportGauge(U); Rop.ImportGauge(U); diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h index 56dff94d..656e9b2f 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h @@ -1,4 +1,3 @@ - /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -43,8 +42,7 @@ NAMESPACE_BEGIN(Grid); // template -class OneFlavourEvenOddRationalPseudoFermionAction - : public Action { +class OneFlavourEvenOddRationalPseudoFermionAction : public Action { public: INHERIT_IMPL_TYPES(Impl); @@ -103,7 +101,7 @@ public: return sstream.str(); } - virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { // P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi} // = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi} // Phi = MpcdagMpc^{1/4} eta @@ -156,7 +154,10 @@ public: msCG(Mpc, PhiOdd, Y); - if ( (rand()%param.BoundsCheckFreq)==0 ) { + auto grid = FermOp.FermionGrid(); + auto r=rand(); + grid->Broadcast(0,r); 
+ if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(FermOp.FermionRedBlackGrid()); gauss = PhiOdd; HighBoundCheck(Mpc,gauss,param.hi); diff --git a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h index e5f0b602..e968b8e4 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h @@ -101,7 +101,7 @@ NAMESPACE_BEGIN(Grid); } - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi // @@ -170,7 +170,10 @@ NAMESPACE_BEGIN(Grid); msCG_M(MdagM,X,Y); // Randomly apply rational bounds checks. - if ( (rand()%param.BoundsCheckFreq)==0 ) { + auto grid = NumOp.FermionGrid(); + auto r=rand(); + grid->Broadcast(0,r); + if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(NumOp.FermionRedBlackGrid()); gauss = PhiOdd; HighBoundCheck(MdagM,gauss,param.hi); diff --git a/Grid/qcd/action/pseudofermion/OneFlavourRational.h b/Grid/qcd/action/pseudofermion/OneFlavourRational.h index f6c823c9..aa647445 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourRational.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourRational.h @@ -98,7 +98,7 @@ NAMESPACE_BEGIN(Grid); - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(phi) = e^{- phi^dag (MdagM)^-1/2 phi} @@ -142,7 +142,10 @@ NAMESPACE_BEGIN(Grid); msCG(MdagMOp,Phi,Y); - if ( (rand()%param.BoundsCheckFreq)==0 ) { + auto grid = FermOp.FermionGrid(); + auto r=rand(); + grid->Broadcast(0,r); + if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(FermOp.FermionGrid()); gauss = Phi; HighBoundCheck(MdagMOp,gauss,param.hi); diff --git 
a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h index 5fae2fe9..128c869a 100644 --- a/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h +++ b/Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h @@ -95,7 +95,7 @@ NAMESPACE_BEGIN(Grid); } - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi // @@ -156,7 +156,10 @@ NAMESPACE_BEGIN(Grid); msCG_M(MdagM,X,Y); // Randomly apply rational bounds checks. - if ( (rand()%param.BoundsCheckFreq)==0 ) { + auto grid = NumOp.FermionGrid(); + auto r=rand(); + grid->Broadcast(0,r); + if ( (r%param.BoundsCheckFreq)==0 ) { FermionField gauss(NumOp.FermionGrid()); gauss = Phi; HighBoundCheck(MdagM,gauss,param.hi); diff --git a/Grid/qcd/action/pseudofermion/TwoFlavour.h b/Grid/qcd/action/pseudofermion/TwoFlavour.h index f905a675..2ac97ddd 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavour.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavour.h @@ -73,7 +73,7 @@ public: ////////////////////////////////////////////////////////////////////////////////////// // Push the gauge field in to the dops. 
Assume any BC's and smearing already applied ////////////////////////////////////////////////////////////////////////////////////// - virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) { // P(phi) = e^{- phi^dag (MdagM)^-1 phi} // Phi = Mdag eta // P(eta) = e^{- eta^dag eta} diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h index a3cf8f08..2e5208a8 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h @@ -77,7 +77,7 @@ public: ////////////////////////////////////////////////////////////////////////////////////// // Push the gauge field in to the dops. Assume any BC's and smearing already applied ////////////////////////////////////////////////////////////////////////////////////// - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi} // Phi = McpDag eta diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h index d1d6f336..da628c75 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h @@ -84,7 +84,7 @@ NAMESPACE_BEGIN(Grid); } - virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi} // diff --git a/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h b/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h index 4d72faba..f584706d 100644 --- a/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h +++ b/Grid/qcd/action/pseudofermion/TwoFlavourRatio.h @@ -64,7 +64,7 @@ public: return sstream.str(); } - virtual 
void refresh(const GaugeField &U, GridParallelRNG& pRNG) { + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { // P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi} // diff --git a/Grid/qcd/action/scalar/ScalarAction.h b/Grid/qcd/action/scalar/ScalarAction.h index 34fc4fac..8b4f4f79 100644 --- a/Grid/qcd/action/scalar/ScalarAction.h +++ b/Grid/qcd/action/scalar/ScalarAction.h @@ -55,7 +55,7 @@ public: } virtual std::string action_name() {return "ScalarAction";} - virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // noop as no pseudoferms + virtual void refresh(const Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {} // noop as no pseudoferms virtual RealD S(const Field &p) { return (mass_square * 0.5 + Nd) * ScalarObs::sumphisquared(p) + diff --git a/Grid/qcd/action/scalar/ScalarImpl.h b/Grid/qcd/action/scalar/ScalarImpl.h index 14675b11..13bd6c90 100644 --- a/Grid/qcd/action/scalar/ScalarImpl.h +++ b/Grid/qcd/action/scalar/ScalarImpl.h @@ -27,7 +27,7 @@ public: typedef Field FermionField; typedef Field PropagatorField; - static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){ + static inline void generate_momenta(Field& P, GridSerialRNG &sRNG, GridParallelRNG& pRNG){ RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling gaussian(pRNG, P); P *= scale; @@ -54,6 +54,10 @@ public: static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) { U = 1.0; } + + static inline void Project(Field &U) { + return; + } static void MomentumSpacePropagator(Field &out, RealD m) { @@ -147,7 +151,7 @@ public: out = one / out; } - static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) + static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG) { RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling #ifndef USE_FFT_ACCELERATION @@ -234,6 +238,10 @@ public: #endif //USE_FFT_ACCELERATION } + static inline void 
Project(Field &U) { + return; + } + static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) { Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U); } diff --git a/Grid/qcd/action/scalar/ScalarInteractionAction.h b/Grid/qcd/action/scalar/ScalarInteractionAction.h index 5a5f9251..e04dd486 100644 --- a/Grid/qcd/action/scalar/ScalarInteractionAction.h +++ b/Grid/qcd/action/scalar/ScalarInteractionAction.h @@ -77,7 +77,7 @@ public: virtual std::string action_name() { return "ScalarAction"; } - virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} + virtual void refresh(const Field &U, GridSerialRNG & sRNG, GridParallelRNG &pRNG) {} virtual RealD S(const Field &p) { diff --git a/Grid/qcd/hmc/GenericHMCrunner.h b/Grid/qcd/hmc/GenericHMCrunner.h index c2443dd0..98e8175a 100644 --- a/Grid/qcd/hmc/GenericHMCrunner.h +++ b/Grid/qcd/hmc/GenericHMCrunner.h @@ -159,6 +159,13 @@ private: Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U, Resources.GetSerialRNG(), Resources.GetParallelRNG()); + } else { + // others + std::cout << GridLogError << "Unrecognized StartingType\n"; + std::cout + << GridLogError + << "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n"; + exit(1); } Smearing.set_Field(U); diff --git a/Grid/qcd/hmc/HMC.h b/Grid/qcd/hmc/HMC.h index 0f933204..44674ea5 100644 --- a/Grid/qcd/hmc/HMC.h +++ b/Grid/qcd/hmc/HMC.h @@ -95,7 +95,7 @@ private: typedef typename IntegratorType::Field Field; typedef std::vector< HmcObservable * > ObsListType; - + //pass these from the resource manager GridSerialRNG &sRNG; GridParallelRNG &pRNG; @@ -139,7 +139,7 @@ private: // Evolution ///////////////////////////////////////////////////////// RealD evolve_hmc_step(Field &U) { - TheIntegrator.refresh(U, pRNG); // set U and initialize P and phi's + TheIntegrator.refresh(U, sRNG, pRNG); // set U and initialize P and phi's RealD H0 = TheIntegrator.S(U); // initial state action diff --git a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h 
b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h index 3cd05ebc..c09fdeeb 100644 --- a/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h +++ b/Grid/qcd/hmc/checkpointers/BaseCheckpointer.h @@ -74,7 +74,7 @@ public: conf_file = os.str(); } } - + virtual ~BaseHmcCheckpointer(){}; void check_filename(const std::string &filename){ std::ifstream f(filename.c_str()); if(!f.good()){ @@ -82,7 +82,6 @@ public: abort(); }; } - virtual void initialize(const CheckpointerParameters &Params) = 0; virtual void CheckpointRestore(int traj, typename Impl::Field &U, diff --git a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h index 269caa6e..1bb8aa1a 100644 --- a/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h +++ b/Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h @@ -45,6 +45,7 @@ private: public: INHERIT_GIMPL_TYPES(Implementation); + typedef GaugeStatistics GaugeStats; ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); } @@ -78,7 +79,7 @@ public: BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb); IldgWriter _IldgWriter(grid->IsBoss()); _IldgWriter.open(config); - _IldgWriter.writeConfiguration(U, traj, config, config); + _IldgWriter.writeConfiguration(U, traj, config, config); _IldgWriter.close(); std::cout << GridLogMessage << "Written ILDG Configuration on " << config @@ -105,7 +106,7 @@ public: FieldMetaData header; IldgReader _IldgReader; _IldgReader.open(config); - _IldgReader.readConfiguration(U,header); // format from the header + _IldgReader.readConfiguration(U,header); // format from the header _IldgReader.close(); std::cout << GridLogMessage << "Read ILDG Configuration from " << config diff --git a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h index cfcc44d8..4534e4c4 100644 --- a/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h +++ b/Grid/qcd/hmc/checkpointers/NerscCheckpointer.h @@ -43,7 +43,8 @@ private: public: 
INHERIT_GIMPL_TYPES(Gimpl); // only for gauge configurations - + typedef GaugeStatistics GaugeStats; + NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); } void initialize(const CheckpointerParameters &Params_) { @@ -60,7 +61,7 @@ public: int precision32 = 1; int tworow = 0; NerscIO::writeRNGState(sRNG, pRNG, rng); - NerscIO::writeConfiguration(U, config, tworow, precision32); + NerscIO::writeConfiguration(U, config, tworow, precision32); } }; @@ -74,7 +75,7 @@ public: FieldMetaData header; NerscIO::readRNGState(sRNG, pRNG, header, rng); - NerscIO::readConfiguration(U, header, config); + NerscIO::readConfiguration(U, header, config); }; }; diff --git a/Grid/qcd/hmc/integrators/Integrator.h b/Grid/qcd/hmc/integrators/Integrator.h index d5475704..aa28c6c8 100644 --- a/Grid/qcd/hmc/integrators/Integrator.h +++ b/Grid/qcd/hmc/integrators/Integrator.h @@ -33,6 +33,7 @@ directory #define INTEGRATOR_INCLUDED #include +#include "MomentumFilter.h" NAMESPACE_BEGIN(Grid); @@ -78,8 +79,19 @@ protected: RepresentationPolicy Representations; IntegratorParameters Params; + //Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC + //It is applied whenever the momentum is updated / refreshed + //The default filter does nothing + MomentumFilterBase const* MomFilter; + const ActionSet as; + //Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default + static MomentumFilterBase const* getDefaultMomFilter(){ + static MomentumFilterNone filter; + return &filter; + } + void update_P(Field& U, int level, double ep) { t_P[level] += ep; @@ -135,6 +147,8 @@ protected: // Force from the other representations as[level].apply(update_P_hireps, Representations, Mom, U, ep); + + MomFilter->applyFilter(Mom); } void update_U(Field& U, double ep) @@ -174,11 +188,23 @@ public: t_P.resize(levels, 0.0); t_U = 0.0; // initialization of smearer delegated outside of Integrator + + //Default 
the momentum filter to "do-nothing" + MomFilter = getDefaultMomFilter(); }; virtual ~Integrator() {} virtual std::string integrator_name() = 0; + + //Set the momentum filter allowing for manipulation of the conjugate momentum + void setMomentumFilter(const MomentumFilterBase &filter){ + MomFilter = &filter; + } + + //Access the conjugate momentum + const MomentaField & getMomentum() const{ return P; } + void print_parameters() { @@ -210,10 +236,9 @@ public: // over the representations struct _refresh { template - void operator()(std::vector*> repr_set, Repr& Rep, - GridParallelRNG& pRNG) { + void operator()(std::vector*> repr_set, Repr& Rep, GridSerialRNG & sRNG, GridParallelRNG& pRNG) { for (int a = 0; a < repr_set.size(); ++a){ - repr_set.at(a)->refresh(Rep.U, pRNG); + repr_set.at(a)->refresh(Rep.U, sRNG, pRNG); std::cout << GridLogDebug << "Hirep refreshing pseudofermions" << std::endl; } @@ -221,12 +246,12 @@ public: } refresh_hireps{}; // Initialization of momenta and actions - void refresh(Field& U, GridParallelRNG& pRNG) + void refresh(Field& U, GridSerialRNG & sRNG, GridParallelRNG& pRNG) { assert(P.Grid() == U.Grid()); std::cout << GridLogIntegrator << "Integrator refresh\n"; - FieldImplementation::generate_momenta(P, pRNG); + FieldImplementation::generate_momenta(P, sRNG, pRNG); // Update the smeared fields, can be implemented as observer // necessary to keep the fields updated even after a reject @@ -243,12 +268,14 @@ public: // get gauge field from the SmearingPolicy and // based on the boolean is_smeared in actionID Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared); - as[level].actions.at(actionID)->refresh(Us, pRNG); + as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG); } // Refresh the higher representation actions - as[level].apply(refresh_hireps, Representations, pRNG); + as[level].apply(refresh_hireps, Representations, sRNG, pRNG); } + + MomFilter->applyFilter(P); } // to be used by the actionlevel class to iterate @@ 
-313,6 +340,8 @@ public: std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl; } + FieldImplementation::Project(U); + // and that we indeed got to the end of the trajectory assert(fabs(t_U - Params.trajL) < 1.0e-6); diff --git a/Grid/qcd/hmc/integrators/MomentumFilter.h b/Grid/qcd/hmc/integrators/MomentumFilter.h new file mode 100644 index 00000000..2a15d80c --- /dev/null +++ b/Grid/qcd/hmc/integrators/MomentumFilter.h @@ -0,0 +1,94 @@ +/************************************************************************************* + +Grid physics library, www.github.com/paboyle/Grid + +Source file: ./lib/qcd/hmc/integrators/MomentumFilter.h + +Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ +See the full license in the file "LICENSE" in the top level distribution +directory +*************************************************************************************/ +/* END LEGAL */ +//-------------------------------------------------------------------- +#ifndef MOMENTUM_FILTER +#define MOMENTUM_FILTER + +NAMESPACE_BEGIN(Grid); + +//These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh + +template +struct MomentumFilterBase{ + virtual void applyFilter(MomentaField &P) const; +}; + +//Do nothing +template +struct MomentumFilterNone: public MomentumFilterBase{ + void applyFilter(MomentaField &P) const override{} +}; + +//Multiply each site/direction by a Lorentz vector complex number field +//Can be used to implement a mask, zeroing out sites +template +struct MomentumFilterApplyPhase: public MomentumFilterBase{ + typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type + typedef typename MomentaField::scalar_type scalar_type; //scalar complex type + typedef iVector >, Nd > LorentzScalarType; //complex phase for each site/direction + typedef Lattice LatticeLorentzScalarType; + + LatticeLorentzScalarType phase; + + MomentumFilterApplyPhase(const LatticeLorentzScalarType _phase): phase(_phase){} + + //Default to uniform field of (1,0) + MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){ + LorentzScalarType one; + for(int mu=0;musmear(C, U); for (int mu = 0; mu < Nd; mu++) { - if( mu == OrthogDim ) - tmp = 1.0; // Don't smear in the orthogonal direction - else { - tmp = peekLorentz(C, mu); - Umu = peekLorentz(U, mu); - iq_mu = Ta( - tmp * - adj(Umu)); // iq_mu = Ta(Omega_mu) to match the signs with the paper - exponentiate_iQ(tmp, iq_mu); - } - pokeLorentz(u_smr, tmp * Umu, mu); // u_smr = exp(iQ_mu)*U_mu + if( mu == OrthogDim ) continue ; + // u_smr = exp(iQ_mu)*U_mu apart from Orthogdim + Umu = peekLorentz(U, mu); + tmp = peekLorentz(C, mu); + iq_mu = Ta( tmp * adj(Umu)); 
+ exponentiate_iQ(tmp, iq_mu); + pokeLorentz(u_smr, tmp * Umu, mu); } std::cout << GridLogDebug << "Stout smearing completed\n"; }; diff --git a/Grid/qcd/spin/TwoSpinor.h b/Grid/qcd/spin/TwoSpinor.h index 924594ab..8dad0cd0 100644 --- a/Grid/qcd/spin/TwoSpinor.h +++ b/Grid/qcd/spin/TwoSpinor.h @@ -128,7 +128,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spProjTm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(0)-fspin(2); hspin(1)=fspin(1)-fspin(3); } @@ -138,40 +137,50 @@ template > = 0> accelerator_inline void s * 0 0 -1 0 * 0 0 0 -1 */ - template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(0); hspin(1)=fspin(1); } template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; hspin(0)=fspin(2); hspin(1)=fspin(3); } -// template accelerator_inline void fspProj5p (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; rfspin(0)=fspin(0); rfspin(1)=fspin(1); rfspin(2)=Zero(); rfspin(3)=Zero(); } -// template accelerator_inline void fspProj5m (iVector &rfspin,const iVector &fspin) template > = 0> accelerator_inline void spProj5m (iVector &rfspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; rfspin(0)=Zero(); rfspin(1)=Zero(); rfspin(2)=fspin(2); rfspin(3)=fspin(3); } +template > = 0> accelerator_inline void spProj5p (iVector &rfspin,const iVector &fspin) +{ + const int hN = N>>1; + for(int s=0;s > = 0> accelerator_inline void spProj5m (iVector &rfspin,const iVector &fspin) +{ + const int hN = N>>1; + for(int s=0;s > = 0> 
accelerator_inline void s */ template > = 0> accelerator_inline void spReconXp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesMinusI(hspin(1)); @@ -191,7 +199,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconXm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesI(hspin(1)); @@ -199,7 +206,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconXp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(1)); @@ -207,7 +213,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconXm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(1)); @@ -221,7 +226,6 @@ template > = 0> accelerator_inline void a template > = 0> accelerator_inline void spReconYp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)= hspin(1); @@ -229,7 +233,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconYm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=-hspin(1); @@ -237,7 +240,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconYp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); 
fspin(2)+=hspin(1); @@ -245,7 +247,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconYm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(1); @@ -260,7 +261,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spReconZp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=timesMinusI(hspin(0)); @@ -268,7 +268,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconZm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)= timesI(hspin(0)); @@ -276,7 +275,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconZp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=timesI(hspin(0)); @@ -284,7 +282,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconZm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=timesI(hspin(0)); @@ -298,7 +295,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spReconTp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=hspin(0); @@ -306,7 +302,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spReconTm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type 
*SFINAE; fspin(0)=hspin(0); fspin(1)=hspin(1); fspin(2)=-hspin(0); @@ -314,7 +309,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumReconTp (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)+=hspin(0); @@ -322,7 +316,6 @@ template > = 0> accelerator_inline void a } template > = 0> accelerator_inline void accumReconTm (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0); fspin(1)+=hspin(1); fspin(2)-=hspin(0); @@ -336,7 +329,6 @@ template > = 0> accelerator_inline void a */ template > = 0> accelerator_inline void spRecon5p (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though fspin(2)=Zero(); @@ -344,7 +336,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void spRecon5m (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)=Zero(); fspin(1)=Zero(); fspin(2)=hspin(0)+hspin(0); @@ -352,7 +343,6 @@ template > = 0> accelerator_inline void s } template > = 0> accelerator_inline void accumRecon5p (iVector &fspin,const iVector &hspin) { - //typename std::enable_if,SpinorIndex>::value,iVector >::type *SFINAE; fspin(0)+=hspin(0)+hspin(0); fspin(1)+=hspin(1)+hspin(1); } @@ -372,7 +362,6 @@ template > = 0> accelerator_inline void a ////////// template > = 0> accelerator_inline void spProjXp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconXp (iM }} } - - //////// // Xm //////// template accelerator_inline void spProjXm (iScalar &hspin,const iScalar 
&fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjXm (iMatri template accelerator_inline void spReconXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconXm (iMatr template accelerator_inline void accumReconXm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconXm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconXm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconXm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type 
*temp; for(int i=0;i accelerator_inline void spProjYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYp (iMatri template accelerator_inline void spReconYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYp (iMatr template accelerator_inline void accumReconYp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconYp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconYp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconYp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iScalar &hspin,const iScalar &fspin) { - //typename 
std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,const iVector >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconYm (iMatr template accelerator_inline void accumReconYm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconYm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconYm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconYm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconYm (iM //////// template accelerator_inline void spProjZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; 
for(int i=0;i accelerator_inline void spReconZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconZp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconZp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZp (iM //////// template accelerator_inline void spProjZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; 
accumReconZm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconZm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumReconZm (iM //////// template accelerator_inline void spProjTp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjTp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjTp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconTp (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTp (iMatr template accelerator_inline void accumReconTp (iScalar &hspin, iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconTp(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconTp (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconTp (iMatrix &hspin, 
const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProjTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProjTm (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProjTm (iMatri template accelerator_inline void spReconTm (iScalar &hspin, const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spReconTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spReconTm (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spReconTm (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spReconTm (iMatr template accelerator_inline void accumReconTm (iScalar &hspin, const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumReconTm(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumReconTm (iVector &hspin, const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumReconTm (iMatrix &hspin, const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) { - 
//typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatri template accelerator_inline void spRecon5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spRecon5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spRecon5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void spRecon5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spRecon5p (iMatr template accelerator_inline void accumRecon5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumRecon5p(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumRecon5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumRecon5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5p (iM } // four spinor projectors for chiral proj -// template accelerator_inline void fspProj5p (iScalar &hspin,const iScalar &fspin) -template accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) +template > = 0> 
accelerator_inline void spProj5p (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5p(hspin._internal,fspin._internal); } -// template accelerator_inline void fspProj5p (iVector &hspin,iVector &fspin) -template > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5p (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void fspProj5p (iMatrix &hspin,iMatrix &fspin) -template accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5p (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void spProj5p (iMatrix & // 5m //////// -template accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) { spProj5m(hspin._internal,fspin._internal); } -template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { for(int i=0;i accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { for(int i=0;i accelerator_inline void spProj5m (iMatri template accelerator_inline void spRecon5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spRecon5m(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void spRecon5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void 
spRecon5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; accumRecon5m(hspin._internal,fspin._internal); } template > = 0> accelerator_inline void accumRecon5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i accelerator_inline void accumRecon5m (iM // four spinor projectors for chiral proj -// template accelerator_inline void fspProj5m (iScalar &hspin,const iScalar &fspin) -template accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) +template > = 0> accelerator_inline void spProj5m (iScalar &hspin,const iScalar &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iScalar >::type *temp; spProj5m(hspin._internal,fspin._internal); } -// template accelerator_inline void fspProj5m (iVector &hspin,iVector &fspin) -template > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) +template > = 0,IfNotCoarsened > = 0> accelerator_inline void spProj5m (iVector &hspin,const iVector &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iVector >::type *temp; for(int i=0;i accelerator_inline void fspProj5m (iMatrix &hspin,iMatrix &fspin) -template accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) +template > = 0> accelerator_inline void spProj5m (iMatrix &hspin,const iMatrix &fspin) { - //typename std::enable_if,SpinorIndex>::notvalue,iMatrix >::type *temp; for(int i=0;i #include NAMESPACE_BEGIN(Grid); @@ -40,293 +39,467 @@ public: typedef typename FImpl::FermionField FermionField; typedef typename FImpl::PropagatorField 
PropagatorField; - typedef typename FImpl::SitePropagator pobj; - typedef typename ComplexField::vector_object vobj; - typedef Lattice> SpinMatrixField; - typedef typename SpinMatrixField::vector_object sobj; - - static const int epsilon[6][3] ; - static const Real epsilon_sgn[6]; private: - template - static void baryon_site(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const int parity, - const bool * wick_contractions, - robj &result); + template accelerator_inline + static void BaryonSite(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int parity, + const int wick_contractions, + robj &result); + template accelerator_inline + static void BaryonSiteMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int wick_contractions, + robj &result); public: - static void Wick_Contractions(std::string qi, + static void WickContractions(std::string qi, std::string qf, - bool* wick_contractions); + int &wick_contractions); static void ContractBaryons(const PropagatorField &q1_left, - const PropagatorField &q2_left, - const PropagatorField &q3_left, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const bool* wick_contractions, - const int parity, - ComplexField &baryon_corr); + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int wick_contractions, + const int parity, + ComplexField &baryon_corr); + static void ContractBaryonsMatrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField 
&q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int wick_contractions, + SpinMatrixField &baryon_corr); template - static void ContractBaryons_Sliced(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const bool* wick_contractions, - const int parity, - const int nt, - robj &result); + static void ContractBaryonsSliced(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int wick_contractions, + const int parity, + const int nt, + robj &result); + template + static void ContractBaryonsSlicedMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int wick_contractions, + const int nt, + robj &result); private: - template - static void Baryon_Gamma_3pt_Group1_Site( + template accelerator_inline + static void BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result); - template - static void Baryon_Gamma_3pt_Group2_Site( + template accelerator_inline + static void BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result); - template - static void Baryon_Gamma_3pt_Group3_Site( + template accelerator_inline + static void BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj 
&Dq3_ti, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result); public: template - static void Baryon_Gamma_3pt( + static void BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, const PropagatorField &q_tf, int group, int wick_contraction, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, SpinMatrixField &stn_corr); private: - template - static void Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); - template - static void Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); + template accelerator_inline + static void SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); + template accelerator_inline + static void SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); - template - static void Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); - template - static void Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj 
&Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result); + template accelerator_inline + static void SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); + template accelerator_inline + static void SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); + template accelerator_inline + static void XiToSigmaQ1EyeSite(const mobj &Dq_loop, + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); + template accelerator_inline + static void XiToSigmaQ2EyeSite(const mobj &Dq_loop, + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result); public: template - static void Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, - const mobj &Du_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &stn_corr); + static void SigmaToNucleonEye(const PropagatorField &qq_loop, + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr); template - static void Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, - const PropagatorField &qq_tf, - const mobj &Du_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - 
const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &stn_corr); + static void SigmaToNucleonNonEye(const PropagatorField &qq_ti, + const PropagatorField &qq_tf, + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr); + template + static void XiToSigmaEye(const PropagatorField &qq_loop, + const mobj &Dd_spec, + const mobj &Ds_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &xts_corr); }; - -template -const int BaryonUtils::epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; -/*template -const Complex BaryonUtils::epsilon_sgn[6] = {Complex(1), - Complex(1), - Complex(1), - Complex(-1), - Complex(-1), - Complex(-1)}; -*/ -template -const Real BaryonUtils::epsilon_sgn[6] = {1.,1.,1.,-1.,-1.,-1.}; - -//This is the old version +//This computes a baryon contraction on a lattice site, including the spin-trace of the correlation matrix template -template -void BaryonUtils::baryon_site(const mobj &D1, +template accelerator_inline +void BaryonUtils::BaryonSite(const mobj &D1, const mobj &D2, const mobj &D3, - const Gamma GammaA_i, - const Gamma GammaB_i, - const Gamma GammaA_f, - const Gamma GammaB_f, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, const int parity, - const bool * wick_contraction, + const int wick_contraction, robj &result) { - Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) + Gamma g4(Gamma::Algebra::GammaT); //needed for parity P_\pm = 0.5*(1 \pm \gamma_4) - auto D1_GAi = D1 * GammaA_i; - auto D1_GAi_g4 = D1_GAi * g4; - auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4); - auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P; - auto GBf_D1_GAi_P = 
GammaB_f * D1_GAi_P; + auto D1_GAi = D1 * GammaA_i; + auto D1_GAi_g4 = D1_GAi * g4; + auto D1_GAi_P = 0.5*(D1_GAi + (Real)parity * D1_GAi_g4); + auto GAf_D1_GAi_P = GammaA_f * D1_GAi_P; + auto GBf_D1_GAi_P = GammaB_f * D1_GAi_P; - auto D2_GBi = D2 * GammaB_i; - auto GBf_D2_GBi = GammaB_f * D2_GBi; - auto GAf_D2_GBi = GammaA_f * D2_GBi; + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; - auto GBf_D3 = GammaB_f * D3; - auto GAf_D3 = GammaA_f * D3; + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; - for (int ie_f=0; ie_f < 6 ; ie_f++){ - int a_f = epsilon[ie_f][0]; //a - int b_f = epsilon[ie_f][1]; //b - int c_f = epsilon[ie_f][2]; //c + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); for (int ie_i=0; ie_i < 6 ; ie_i++){ - int a_i = epsilon[ie_i][0]; //a' - int b_i = epsilon[ie_i][1]; //b' - int c_i = epsilon[ie_i][2]; //c' + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 
1 : -1); - Real ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; - //This is the \delta_{456}^{123} part - if (wick_contraction[0]){ - for (int rho=0; rho +template accelerator_inline +void BaryonUtils::BaryonSiteMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_i, + const Gamma GammaB_i, + const Gamma GammaA_f, + const Gamma GammaB_f, + const int wick_contraction, + robj &result) +{ + + auto D1_GAi = D1 * GammaA_i; + auto GAf_D1_GAi = GammaA_f * D1_GAi; + auto GBf_D1_GAi = GammaB_f * D1_GAi; + + auto D2_GBi = D2 * GammaB_i; + auto GBf_D2_GBi = GammaB_f * D2_GBi; + auto GAf_D2_GBi = GammaA_f * D2_GBi; + + auto GBf_D3 = GammaB_f * D3; + auto GAf_D3 = GammaA_f * D3; + + Real ee; + + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); + for (int ie_i=0; ie_i < 6 ; ie_i++){ + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); + + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + //This is the \delta_{456}^{123} part + if (wick_contraction & 1){ + for (int rho_i=0; rho_i::baryon_site(const mobj &D1, * flavours. 
* * The array wick_contractions must be of length 6 */ template -void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* wick_contractions) { +void BaryonUtils::WickContractions(std::string qi, std::string qf, int &wick_contractions) { + assert(qi.size() == 3 && qf.size() == 3 && "Only sets of 3 quarks accepted."); const int epsilon[6][3] = {{0,1,2},{1,2,0},{2,0,1},{0,2,1},{2,1,0},{1,0,2}}; + wick_contractions=0; for (int ie=0; ie < 6 ; ie++) { - wick_contractions[ie] = (qi.size() == 3 && qf.size() == 3 - && qi[0] == qf[epsilon[ie][0]] + wick_contractions += ( ( qi[0] == qf[epsilon[ie][0]] && qi[1] == qf[epsilon[ie][1]] - && qi[2] == qf[epsilon[ie][2]]); + && qi[2] == qf[epsilon[ie][2]]) ? 1 : 0) << ie; } } @@ -351,60 +525,87 @@ void BaryonUtils::Wick_Contractions(std::string qi, std::string qf, bool* * Wick_Contractions function above */ template void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, - const PropagatorField &q2_left, - const PropagatorField &q3_left, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const bool* wick_contractions, - const int parity, - ComplexField &baryon_corr) + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int wick_contractions, + const int parity, + ComplexField &baryon_corr) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); GridBase *grid = q1_left.Grid(); - autoView(vbaryon_corr, 
baryon_corr,CpuWrite); - autoView( v1 , q1_left, CpuRead); - autoView( v2 , q2_left, CpuRead); - autoView( v3 , q3_left, CpuRead); + autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( v1 , q1_left , AcceleratorRead); + autoView( v2 , q2_left , AcceleratorRead); + autoView( v3 , q3_left , AcceleratorRead); Real bytes =0.; bytes += grid->oSites() * (432.*sizeof(vComplex) + 126.*sizeof(int) + 36.*sizeof(Real)); for (int ie=0; ie < 6 ; ie++){ if(ie==0 or ie==3){ - bytes += grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) * wick_contractions[ie]; - } - else{ - bytes += grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) * wick_contractions[ie]; + bytes += ( wick_contractions & (1 << ie) ) ? grid->oSites() * (4.*sizeof(int) + 4752.*sizeof(vComplex)) : 0.; + } else{ + bytes += ( wick_contractions & (1 << ie) ) ? grid->oSites() * (64.*sizeof(int) + 5184.*sizeof(vComplex)) : 0.; } } Real t=0.; t =-usecond(); accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto D1 = v1[ss]; - auto D2 = v2[ss]; - auto D3 = v3[ss]; - vobj result=Zero(); - baryon_site(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); - vbaryon_corr[ss] = result; + auto D1 = v1(ss); + auto D2 = v2(ss); + auto D3 = v3(ss); + typedef decltype(coalescedRead(vbaryon_corr[0])) cVec; + cVec result=Zero(); + BaryonSite(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,parity,wick_contractions,result); + coalescedWrite(vbaryon_corr[ss],result); } );//end loop over lattice sites t += usecond(); - std::cout << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; + std::cout << GridLogDebug << std::setw(10) << bytes/t*1.0e6/1024/1024/1024 << " GB/s " << std::endl; +} +template +void BaryonUtils::ContractBaryonsMatrix(const PropagatorField &q1_left, + const PropagatorField &q2_left, + const PropagatorField &q3_left, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma 
GammaB_right, + const int wick_contractions, + SpinMatrixField &baryon_corr) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + GridBase *grid = q1_left.Grid(); + + autoView(vbaryon_corr , baryon_corr , AcceleratorWrite); + autoView( v1 , q1_left , AcceleratorRead); + autoView( v2 , q2_left , AcceleratorRead); + autoView( v3 , q3_left , AcceleratorRead); + + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto D1 = v1(ss); + auto D2 = v2(ss); + auto D3 = v3(ss); + typedef decltype(coalescedRead(vbaryon_corr[0])) spinor; + spinor result=Zero(); + BaryonSiteMatrix(D1,D2,D3,GammaA_left,GammaB_left,GammaA_right,GammaB_right,wick_contractions,result); + coalescedWrite(vbaryon_corr[ss],result); + } );//end loop over lattice sites } /* The array wick_contractions must be of length 6. The order * @@ -414,31 +615,48 @@ void BaryonUtils::ContractBaryons(const PropagatorField &q1_left, * Wick_Contractions function above */ template template -void BaryonUtils::ContractBaryons_Sliced(const mobj &D1, - const mobj &D2, - const mobj &D3, - const Gamma GammaA_left, - const Gamma GammaB_left, - const Gamma GammaA_right, - const Gamma GammaB_right, - const bool* wick_contractions, - const int parity, - const int nt, - robj &result) +void BaryonUtils::ContractBaryonsSliced(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int wick_contractions, + const int parity, + const int nt, + robj &result) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); + + for (int t=0; t +template +void BaryonUtils::ContractBaryonsSlicedMatrix(const mobj &D1, + const mobj &D2, + const mobj &D3, + const Gamma GammaA_left, + const Gamma 
GammaB_left, + const Gamma GammaA_right, + const Gamma GammaB_right, + const int wick_contractions, + const int nt, + robj &result) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - std::cout << "GammaA (left) " << (GammaA_left.g) << std::endl; - std::cout << "GammaB (left) " << (GammaB_left.g) << std::endl; - std::cout << "GammaA (right) " << (GammaA_right.g) << std::endl; - std::cout << "GammaB (right) " << (GammaB_right.g) << std::endl; - - assert(parity==1 || parity == -1 && "Parity must be +1 or -1"); - for (int t=0; t::ContractBaryons_Sliced(const mobj &D1, * Dq3_spec is a quark line from t_i to t_f * Dq4_tf is a quark line from t_f to t_J */ template -template -void BaryonUtils::Baryon_Gamma_3pt_Group1_Site( +template accelerator_inline +void BaryonUtils::BaryonGamma3ptGroup1Site( const mobj &Dq1_ti, const mobj2 &Dq2_spec, const mobj2 &Dq3_spec, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result) { - Gamma g5(Gamma::Algebra::Gamma5); + Gamma g5(Gamma::Algebra::Gamma5); - auto adjD4_g_D1 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq1_ti; - auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1; - auto D2_Gi = Dq2_spec * GammaBi; - auto Gf_D2_Gi = GammaBf * D2_Gi; - auto Gf_D3 = GammaBf * Dq3_spec; + auto adjD4 = g5 * adj(Dq4_tf) * g5 ; + auto adjD4_g_D1 = adjD4 * GammaJ * Dq1_ti; + auto Gf_adjD4_g_D1 = GammaBf * adjD4_g_D1; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; + auto Gf_D3 = GammaBf * Dq3_spec; - int a_f, b_f, c_f; - int a_i, b_i, c_i; + Real ee; - Real ee; - - for (int ie_f=0; ie_f < 6 ; ie_f++){ - a_f = epsilon[ie_f][0]; //a - b_f = epsilon[ie_f][1]; //b - c_f = epsilon[ie_f][2]; //c + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? 
(ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); for (int ie_i=0; ie_i < 6 ; ie_i++){ - a_i = epsilon[ie_i][0]; //a' - b_i = epsilon[ie_i][1]; //b' - c_i = epsilon[ie_i][2]; //c' + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + ee = Real(eSgn_f * eSgn_i); - for (int alpha_f=0; alpha_f::Baryon_Gamma_3pt_Group1_Site( * Dq3_spec is a quark line from t_i to t_f * Dq4_tf is a quark line from t_f to t_J */ template -template -void BaryonUtils::Baryon_Gamma_3pt_Group2_Site( +template accelerator_inline +void BaryonUtils::BaryonGamma3ptGroup2Site( const mobj2 &Dq1_spec, const mobj &Dq2_ti, const mobj2 &Dq3_spec, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result) { - Gamma g5(Gamma::Algebra::Gamma5); + Gamma g5(Gamma::Algebra::Gamma5); - auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi; - auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi; - auto Gf_D1 = GammaBf * Dq1_spec; - auto Gf_D3 = GammaBf * Dq3_spec; + auto adjD4_g_D2_Gi = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq2_ti * GammaBi; + auto Gf_adjD4_g_D2_Gi = GammaBf * adjD4_g_D2_Gi; + auto Gf_D1 = GammaBf * Dq1_spec; + auto Gf_D3 = GammaBf * Dq3_spec; - int a_f, b_f, c_f; - int a_i, b_i, c_i; + Real ee; - Real ee; - - for (int ie_f=0; ie_f < 6 ; ie_f++){ - a_f = epsilon[ie_f][0]; //a - b_f = epsilon[ie_f][1]; //b - c_f = epsilon[ie_f][2]; //c + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? 
(ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); for (int ie_i=0; ie_i < 6 ; ie_i++){ - a_i = epsilon[ie_i][0]; //a' - b_i = epsilon[ie_i][1]; //b' - c_i = epsilon[ie_i][2]; //c' + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - for (int alpha_f=0; alpha_f::Baryon_Gamma_3pt_Group2_Site( * Dq3_ti is a quark line from t_i to t_J * Dq4_tf is a quark line from t_f to t_J */ template -template -void BaryonUtils::Baryon_Gamma_3pt_Group3_Site( +template accelerator_inline +void BaryonUtils::BaryonGamma3ptGroup3Site( const mobj2 &Dq1_spec, const mobj2 &Dq2_spec, const mobj &Dq3_ti, const mobj &Dq4_tf, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, int wick_contraction, robj &result) { - Gamma g5(Gamma::Algebra::Gamma5); + Gamma g5(Gamma::Algebra::Gamma5); - auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti; - auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3; - auto Gf_D1 = GammaBf * Dq1_spec; - auto D2_Gi = Dq2_spec * GammaBi; - auto Gf_D2_Gi = GammaBf * D2_Gi; + auto adjD4_g_D3 = g5 * adj(Dq4_tf) * g5 * GammaJ * Dq3_ti; + auto Gf_adjD4_g_D3 = GammaBf * adjD4_g_D3; + auto Gf_D1 = GammaBf * Dq1_spec; + auto D2_Gi = Dq2_spec * GammaBi; + auto Gf_D2_Gi = GammaBf * D2_Gi; - int a_f, b_f, c_f; - int a_i, b_i, c_i; + Real ee; - Real ee; - - for (int ie_f=0; ie_f < 6 ; ie_f++){ - a_f = epsilon[ie_f][0]; //a - b_f = epsilon[ie_f][1]; //b - c_f = epsilon[ie_f][2]; //c + for (int ie_f=0; ie_f < 6 ; ie_f++){ + int a_f = (ie_f < 3 ? 
ie_f : (6-ie_f)%3 ); //epsilon[ie_n][0]; //a + int b_f = (ie_f < 3 ? (ie_f+1)%3 : (8-ie_f)%3 ); //epsilon[ie_n][1]; //b + int c_f = (ie_f < 3 ? (ie_f+2)%3 : (7-ie_f)%3 ); //epsilon[ie_n][2]; //c + int eSgn_f = (ie_f < 3 ? 1 : -1); for (int ie_i=0; ie_i < 6 ; ie_i++){ - a_i = epsilon[ie_i][0]; //a' - b_i = epsilon[ie_i][1]; //b' - c_i = epsilon[ie_i][2]; //c' + int a_i = (ie_i < 3 ? ie_i : (6-ie_i)%3 ); //epsilon[ie_s][0]; //a' + int b_i = (ie_i < 3 ? (ie_i+1)%3 : (8-ie_i)%3 ); //epsilon[ie_s][1]; //b' + int c_i = (ie_i < 3 ? (ie_i+2)%3 : (7-ie_i)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_i = (ie_i < 3 ? 1 : -1); - ee = epsilon_sgn[ie_f] * epsilon_sgn[ie_i]; + ee = Real(eSgn_f * eSgn_i); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; - for (int alpha_f=0; alpha_f::Baryon_Gamma_3pt_Group3_Site( * https://aportelli.github.io/Hadrons-doc/#/mcontraction */ template template -void BaryonUtils::Baryon_Gamma_3pt( +void BaryonUtils::BaryonGamma3pt( const PropagatorField &q_ti, const mobj &Dq_spec1, const mobj &Dq_spec2, const PropagatorField &q_tf, int group, int wick_contraction, - const Gamma GammaJ, - const Gamma GammaBi, - const Gamma GammaBf, + const Gamma GammaJ, + const Gamma GammaBi, + const Gamma GammaBf, SpinMatrixField &stn_corr) { - GridBase *grid = q_tf.Grid(); + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - autoView( vcorr, stn_corr, CpuWrite); - autoView( vq_ti , q_ti, CpuRead); - autoView( vq_tf , q_tf, CpuRead); + GridBase *grid = q_tf.Grid(); - if (group == 1) { - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti[ss]; - auto Dq_tf = vq_tf[ss]; - sobj result=Zero(); - Baryon_Gamma_3pt_Group1_Site(Dq_ti,Dq_spec1,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - vcorr[ss] += result; - });//end loop over lattice sites - } else if (group == 2) { - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti[ss]; - auto Dq_tf = 
vq_tf[ss]; - sobj result=Zero(); - Baryon_Gamma_3pt_Group2_Site(Dq_spec1,Dq_ti,Dq_spec2,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); - vcorr[ss] += result; - });//end loop over lattice sites - } else if (group == 3) { - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_ti = vq_ti[ss]; - auto Dq_tf = vq_tf[ss]; - sobj result=Zero(); - Baryon_Gamma_3pt_Group3_Site(Dq_spec1,Dq_spec2,Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + autoView( vcorr , stn_corr , AcceleratorWrite); + autoView( vq_ti , q_ti , AcceleratorRead); + autoView( vq_tf , q_tf , AcceleratorRead); + + Vector my_Dq_spec{Dq_spec1,Dq_spec2}; + mobj * Dq_spec_p = &my_Dq_spec[0]; + + if (group == 1) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + BaryonGamma3ptGroup1Site(Dq_ti,Dq_spec_p[0],Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); + });//end loop over lattice sites + } else if (group == 2) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + BaryonGamma3ptGroup2Site(Dq_spec_p[0],Dq_ti,Dq_spec_p[1],Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); + });//end loop over lattice sites + } else if (group == 3) { + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + BaryonGamma3ptGroup3Site(Dq_spec_p[0],Dq_spec_p[1],Dq_ti,Dq_tf,GammaJ,GammaBi,GammaBf,wick_contraction,result); + coalescedWrite(vcorr[ss],coalescedRead(vcorr[ss])+result); + });//end loop over lattice sites + } - vcorr[ss] += result; - });//end loop over lattice 
sites - } } /*********************************************************************** * End of BaryonGamma3pt-function code. * - * * + * * * The following code is for Sigma -> N rare hypeon decays * **********************************************************************/ @@ -786,46 +1017,56 @@ void BaryonUtils::Baryon_Gamma_3pt( * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template -void BaryonUtils::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) +template accelerator_inline +void BaryonUtils::SigmaToNucleonQ1EyeSite(const mobj &Dq_loop, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) { Gamma g5(Gamma::Algebra::Gamma5); - auto DuG = Du_spec * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; - // Dq_loop * \gamma_\mu^L - auto DqG = Dq_loop * Gamma_H; + auto adjDd_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; + auto Gn_adjDd_GH_Ds = GammaB_nucl * adjDd_GH_Ds; + auto Du_Gs = Du_spec * GammaB_sigma; + auto Dq_GH = Dq_loop * Gamma_H; + auto Tr_Dq_GH = trace(Dq_GH)()()(); + + Real ee; for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c + int a_n = (ie_n < 3 ? ie_n : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a + int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b + int c_n = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c + int eSgn_n = (ie_n < 3 ? 
1 : -1); for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s::Sigma_to_Nucleon_Q1_Eye_site(const mobj &Dq_loop, * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template -void BaryonUtils::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) +template accelerator_inline +void BaryonUtils::SigmaToNucleonQ1NonEyeSite(const mobj &Du_ti, + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) { Gamma g5(Gamma::Algebra::Gamma5); - auto DuG = Du_spec * GammaB_nucl; - auto adjDu = g5 * adj(Du_tf) * g5; - auto adjDuG = adjDu * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto GDsGDd = GammaB_sigma * Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; - // Dq_loop * \gamma_\mu^L - auto DuGH = Du_ti * Gamma_H; + auto Du_Gs = Du_spec * GammaB_sigma; + auto adjDd_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Ds_ti; + auto Gn_adjDd_GH_Ds = GammaB_nucl * adjDd_GH_Ds; + auto adjDu_GH_Du = g5 * adj(Du_tf) * g5 * Gamma_H * Du_ti; + auto adjDu_GH_Du_Gs = adjDu_GH_Du * GammaB_sigma; + + Real ee; for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c + int a_n = (ie_n < 3 ? ie_n : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a + int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b + int c_n = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c + int eSgn_n = (ie_n < 3 ? 
1 : -1); for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s::Sigma_to_Nucleon_Q1_NonEye_site(const mobj &Du_ti, * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template -void BaryonUtils::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, - const mobj2 &Du_spec, - const mobj &Dd_tf, - const mobj &Ds_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - robj &result) +template accelerator_inline +void BaryonUtils::SigmaToNucleonQ2EyeSite(const mobj &Dq_loop, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) { Gamma g5(Gamma::Algebra::Gamma5); - auto DuG = Du_spec * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L - auto GDsG = GammaB_sigma * Ds_ti * Gamma_H; - // Dq_loop * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto DqGDd = Dq_loop * Gamma_H * g5 * adj(Dd_tf) * g5; + auto adjDd_GH_Duloop_GH_Ds = g5 * adj(Dd_tf) * g5 * Gamma_H * Dq_loop * Gamma_H * Ds_ti; + auto Gn_adjDd_GH_Duloop_GH_Ds = GammaB_nucl * adjDd_GH_Duloop_GH_Ds; + auto Du_Gs = Du_spec * GammaB_sigma; + + Real ee; for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c + int a_n = (ie_n < 3 ? ie_n : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a + int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b + int c_n = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c + int eSgn_n = (ie_n < 3 ? 
1 : -1); for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s::Sigma_to_Nucleon_Q2_Eye_site(const mobj &Dq_loop, * Dd_tf is a quark line from t_f to t_H * Ds_ti is a quark line from t_i to t_H */ template -template -void BaryonUtils::Sigma_to_Nucleon_Q2_NonEye_site(const mobj &Du_ti, - const mobj &Du_tf, - const mobj2 &Du_spec, +template accelerator_inline +void BaryonUtils::SigmaToNucleonQ2NonEyeSite(const mobj &Du_ti, + const mobj &Du_tf, + const mobj2 &Du_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + robj &result) +{ + + Gamma g5(Gamma::Algebra::Gamma5); + + auto Du_Gs = Du_spec * GammaB_sigma; + auto adjDu_GH_Ds = g5 * adj(Du_tf) * g5 * Gamma_H * Ds_ti; + auto adjDd_GH_Du = g5 * adj(Dd_tf) * g5 * Gamma_H * Du_ti; + auto Gn_adjDd_GH_Du = GammaB_nucl * adjDd_GH_Du; // for some reason I needed to split this into two lines to avoid the compilation error 'error: identifier "Grid::Gamma::mul" is undefined in device code' + + auto Gn_adjDd_GH_Du_Gs = Gn_adjDd_GH_Du * GammaB_sigma; + + Real ee; + + for (int ie_n=0; ie_n < 6 ; ie_n++){ + int a_n = (ie_n < 3 ? ie_n : (6-ie_n)%3 ); //epsilon[ie_n][0]; //a + int b_n = (ie_n < 3 ? (ie_n+1)%3 : (8-ie_n)%3 ); //epsilon[ie_n][1]; //b + int c_n = (ie_n < 3 ? (ie_n+2)%3 : (7-ie_n)%3 ); //epsilon[ie_n][2]; //c + int eSgn_n = (ie_n < 3 ? 1 : -1); + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' + int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' + int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_s = (ie_s < 3 ? 
1 : -1); + + ee = Real(eSgn_n * eSgn_s); //epsilon_sgn[ie_n] * epsilon_sgn[ie_s]; + + for (int alpha_n=0; alpha_n +template +void BaryonUtils::SigmaToNucleonEye(const PropagatorField &qq_loop, + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, + const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + GridBase *grid = qs_ti.Grid(); + + autoView( vcorr , stn_corr , AcceleratorWrite); + autoView( vq_loop , qq_loop , AcceleratorRead); + autoView( vd_tf , qd_tf , AcceleratorRead); + autoView( vs_ti , qs_ti , AcceleratorRead); + + Vector my_Dq_spec{Du_spec}; + mobj * Dq_spec_p = &my_Dq_spec[0]; + + if(op == "Q1"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + SigmaToNucleonQ1EyeSite(Dq_loop,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else if(op == "Q2"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + SigmaToNucleonQ2EyeSite(Dq_loop,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else { + assert(0 && "Weak Operator not correctly specified"); + } +} + +template +template +void BaryonUtils::SigmaToNucleonNonEye(const PropagatorField &qq_ti, + const PropagatorField &qq_tf, + const mobj &Du_spec, + const PropagatorField &qd_tf, + const PropagatorField &qs_ti, + const Gamma Gamma_H, + const Gamma GammaB_sigma, 
+ const Gamma GammaB_nucl, + const std::string op, + SpinMatrixField &stn_corr) +{ + + assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); + assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); + + GridBase *grid = qs_ti.Grid(); + + autoView( vcorr , stn_corr , AcceleratorWrite ); + autoView( vq_ti , qq_ti , AcceleratorRead ); + autoView( vq_tf , qq_tf , AcceleratorRead ); + autoView( vd_tf , qd_tf , AcceleratorRead ); + autoView( vs_ti , qs_ti , AcceleratorRead ); + + Vector my_Dq_spec{Du_spec}; + mobj * Dq_spec_p = &my_Dq_spec[0]; + + if(op == "Q1"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + SigmaToNucleonQ1NonEyeSite(Dq_ti,Dq_tf,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else if(op == "Q2"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_ti = vq_ti(ss); + auto Dq_tf = vq_tf(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + SigmaToNucleonQ2NonEyeSite(Dq_ti,Dq_tf,Dq_spec_p[0],Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); + coalescedWrite(vcorr[ss],result); + });//end loop over lattice sites + } else { + assert(0 && "Weak Operator not correctly specified"); + } +} + + +/*********************************************************************** + * The following code is for Xi -> Sigma rare hypeon decays * + **********************************************************************/ + +/* Dq_loop is a quark line from t_H to t_H + * Dd_spec is a quark line from t_i to t_f + * Ds_spec is a quark line from t_i to t_f + * Dd_tf is a quark line from t_f to t_H + * Ds_ti is a quark line from t_i to t_H */ +template +template accelerator_inline +void 
BaryonUtils::XiToSigmaQ1EyeSite(const mobj &Dq_loop, + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, const mobj &Dd_tf, const mobj &Ds_ti, const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma GammaB_xi, + const Gamma GammaB_sigma, robj &result) { Gamma g5(Gamma::Algebra::Gamma5); - auto DuG = Du_spec * GammaB_nucl; - auto adjDu = g5 * adj(Du_tf) * g5; - auto adjDuG = adjDu * GammaB_nucl; - // Gamma^B * Ds * \gamma_\mu^L - auto GDsG = GammaB_sigma * Ds_ti * Gamma_H; - // Du * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) - auto DuGDd = Du_ti * Gamma_H * g5 * adj(Dd_tf) * g5; + auto DdG = Dd_spec * GammaB_sigma; + auto GDs = GammaB_xi * Ds_spec; + // Ds * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) + auto DsGDd = Ds_ti * Gamma_H * g5 * adj(Dd_tf) * g5; + // DsGDd * GammaB + auto DsGDdG = DsGDd * GammaB_sigma; + // GammaB * DsGDd + auto GDsGDd = GammaB_xi * DsGDd; + // GammaB * DsGDd * GammaB + auto GDsGDdG = GDsGDd * GammaB_sigma; + // \gamma_\mu^L * Dq_loop + auto trGDq = TensorRemove(trace(Gamma_H * Dq_loop)); - for (int ie_n=0; ie_n < 6 ; ie_n++){ - int a_n = epsilon[ie_n][0]; //a - int b_n = epsilon[ie_n][1]; //b - int c_n = epsilon[ie_n][2]; //c - for (int ie_s=0; ie_s < 6 ; ie_s++){ - int a_s = epsilon[ie_s][0]; //a' - int b_s = epsilon[ie_s][1]; //b' - int c_s = epsilon[ie_s][2]; //c' - for (int alpha_s=0; alpha_s +template accelerator_inline +void BaryonUtils::XiToSigmaQ2EyeSite(const mobj &Dq_loop, + const mobj2 &Dd_spec, + const mobj2 &Ds_spec, + const mobj &Dd_tf, + const mobj &Ds_ti, + const Gamma Gamma_H, + const Gamma GammaB_xi, + const Gamma GammaB_sigma, + robj &result) +{ + + Gamma g5(Gamma::Algebra::Gamma5); + + auto DdG = Dd_spec * GammaB_sigma; + auto GDs = GammaB_xi * Ds_spec; + // Ds * \gamma_\mu^L * Dq_loop * \gamma_\mu^L * (\gamma_5 * Dd^\dagger * \gamma_5) + auto DsGDqGDd = Ds_ti * Gamma_H * Dq_loop * Gamma_H * g5 * adj(Dd_tf) * g5; + // DsGDd * GammaB + auto DsGDqGDdG = DsGDqGDd * GammaB_sigma; 
+ // GammaB * DsGDd + auto GDsGDqGDd = GammaB_xi * DsGDqGDd; + // GammaB * DsGDd * GammaB + auto GDsGDqGDdG = GDsGDqGDd * GammaB_sigma; + + Real ee; + + for (int ie_s=0; ie_s < 6 ; ie_s++){ + int a_s = (ie_s < 3 ? ie_s : (6-ie_s)%3 ); //epsilon[ie_s][0]; //a' + int b_s = (ie_s < 3 ? (ie_s+1)%3 : (8-ie_s)%3 ); //epsilon[ie_s][1]; //b' + int c_s = (ie_s < 3 ? (ie_s+2)%3 : (7-ie_s)%3 ); //epsilon[ie_s][2]; //c' + int eSgn_s = (ie_s < 3 ? 1 : -1); + for (int ie_x=0; ie_x < 6 ; ie_x++){ + int a_x = (ie_x < 3 ? ie_x : (6-ie_x)%3 ); //epsilon[ie_x][0]; //a' + int b_x = (ie_x < 3 ? (ie_x+1)%3 : (8-ie_x)%3 ); //epsilon[ie_x][1]; //b' + int c_x = (ie_x < 3 ? (ie_x+2)%3 : (7-ie_x)%3 ); //epsilon[ie_x][2]; //c' + int eSgn_x = (ie_x < 3 ? 1 : -1); + ee = Real(eSgn_s * eSgn_x); + for (int alpha_x=0; alpha_x template -void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, - const mobj &Du_spec, +void BaryonUtils::XiToSigmaEye(const PropagatorField &qq_loop, + const mobj &Dd_spec, + const mobj &Ds_spec, const PropagatorField &qd_tf, const PropagatorField &qs_ti, const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, + const Gamma GammaB_xi, + const Gamma GammaB_sigma, const std::string op, - SpinMatrixField &stn_corr) + SpinMatrixField &xts_corr) { assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); @@ -1018,67 +1539,38 @@ void BaryonUtils::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop, GridBase *grid = qs_ti.Grid(); - autoView( vcorr, stn_corr, CpuWrite); - autoView( vq_loop , qq_loop, CpuRead); - autoView( vd_tf , qd_tf, CpuRead); - autoView( vs_ti , qs_ti, CpuRead); + autoView( vcorr , xts_corr , AcceleratorWrite); + autoView( vq_loop , qq_loop , AcceleratorRead); + autoView( vd_tf , qd_tf , AcceleratorRead); + autoView( vs_ti , qs_ti , AcceleratorRead); - accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - auto Dq_loop = vq_loop[ss]; - auto Dd_tf = vd_tf[ss]; - auto Ds_ti = vs_ti[ss]; - sobj result=Zero(); - if(op == 
"Q1"){ - Sigma_to_Nucleon_Q1_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_Eye_site(Dq_loop,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else { - assert(0 && "Weak Operator not correctly specified"); - } - vcorr[ss] = result; - } );//end loop over lattice sites + Vector my_Dq_spec{Dd_spec,Ds_spec}; + mobj * Dq_spec_p = &my_Dq_spec[0]; + + if(op == "Q1"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + XiToSigmaQ1EyeSite(Dq_loop,Dq_spec_p[0],Dq_spec_p[1],Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); + coalescedWrite(vcorr[ss],result); + } );//end loop over lattice sites + } else if(op == "Q2"){ + accelerator_for(ss, grid->oSites(), grid->Nsimd(), { + auto Dq_loop = vq_loop(ss); + auto Dd_tf = vd_tf(ss); + auto Ds_ti = vs_ti(ss); + typedef decltype(coalescedRead(vcorr[0])) spinor; + spinor result=Zero(); + XiToSigmaQ2EyeSite(Dq_loop,Dq_spec_p[0],Dq_spec_p[1],Dd_tf,Ds_ti,Gamma_H,GammaB_xi,GammaB_sigma,result); + coalescedWrite(vcorr[ss],result); + } );//end loop over lattice sites + } else { + assert(0 && "Weak Operator not correctly specified"); + } } -template -template -void BaryonUtils::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti, - const PropagatorField &qq_tf, - const mobj &Du_spec, - const PropagatorField &qd_tf, - const PropagatorField &qs_ti, - const Gamma Gamma_H, - const Gamma GammaB_sigma, - const Gamma GammaB_nucl, - const std::string op, - SpinMatrixField &stn_corr) -{ - - assert(Ns==4 && "Baryon code only implemented for N_spin = 4"); - assert(Nc==3 && "Baryon code only implemented for N_colour = 3"); - - GridBase *grid = qs_ti.Grid(); - - autoView( vcorr , stn_corr, CpuWrite); - autoView( vq_ti , qq_ti, CpuRead); - autoView( vq_tf , qq_tf, CpuRead); - autoView( vd_tf , 
qd_tf, CpuRead); - autoView( vs_ti , qs_ti, CpuRead); - // accelerator_for(ss, grid->oSites(), grid->Nsimd(), { - thread_for(ss,grid->oSites(),{ - auto Dq_ti = vq_ti[ss]; - auto Dq_tf = vq_tf[ss]; - auto Dd_tf = vd_tf[ss]; - auto Ds_ti = vs_ti[ss]; - sobj result=Zero(); - if(op == "Q1"){ - Sigma_to_Nucleon_Q1_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else if(op == "Q2"){ - Sigma_to_Nucleon_Q2_NonEye_site(Dq_ti,Dq_tf,Du_spec,Dd_tf,Ds_ti,Gamma_H,GammaB_sigma,GammaB_nucl,result); - } else { - assert(0 && "Weak Operator not correctly specified"); - } - vcorr[ss] = result; - } );//end loop over lattice sites -} NAMESPACE_END(Grid); diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index cee1fa12..6c70706f 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -53,6 +53,24 @@ namespace PeriodicBC { return Cshift(tmp,mu,-1);// moves towards positive mu } + template Lattice + CovShiftIdentityBackward(const Lattice &Link, int mu) + { + return Cshift(adj(Link), mu, -1); + } + + template Lattice + CovShiftIdentityForward(const Lattice &Link, int mu) + { + return Link; + } + + template Lattice + ShiftStaple(const Lattice &Link, int mu) + { + return Cshift(Link, mu, 1); + } + template::value,void>::type * = nullptr> auto CovShiftForward(const Lattice &Link, int mu, @@ -70,6 +88,7 @@ namespace PeriodicBC { return CovShiftBackward(Link,mu,arg); } + } @@ -139,6 +158,38 @@ namespace ConjugateBC { // std::cout<<"Gparity::CovCshiftBackward mu="< Lattice + CovShiftIdentityBackward(const Lattice &Link, int mu) { + GridBase *grid = Link.Grid(); + int Lmu = grid->GlobalDimensions()[mu] - 1; + + Lattice> coor(grid); + LatticeCoordinate(coor, mu); + + Lattice tmp(grid); + tmp = adj(Link); + tmp = where(coor == Lmu, conjugate(tmp), tmp); + return Cshift(tmp, mu, -1); // moves towards positive mu + } + template Lattice + CovShiftIdentityForward(const Lattice &Link, int mu) { + 
return Link; + } + + template Lattice + ShiftStaple(const Lattice &Link, int mu) + { + GridBase *grid = Link.Grid(); + int Lmu = grid->GlobalDimensions()[mu] - 1; + + Lattice> coor(grid); + LatticeCoordinate(coor, mu); + + Lattice tmp(grid); + tmp = Cshift(Link, mu, 1); + tmp = where(coor == Lmu, conjugate(tmp), tmp); + return tmp; + } template::value,void>::type * = nullptr> auto CovShiftForward(const Lattice &Link, diff --git a/Grid/qcd/utils/LinalgUtils.h b/Grid/qcd/utils/LinalgUtils.h index 1e016e4e..964b83d5 100644 --- a/Grid/qcd/utils/LinalgUtils.h +++ b/Grid/qcd/utils/LinalgUtils.h @@ -154,8 +154,8 @@ void axpby_ssp_pminus(Lattice &z,Coeff a,const Lattice &x,Coeff b,co accelerator_for(sss,nloop,vobj::Nsimd(),{ uint64_t ss = sss*Ls; decltype(coalescedRead(y_v[ss+sp])) tmp; - spProj5m(tmp,y_v(ss+sp)); - tmp = a*x_v(ss+s)+b*tmp; + spProj5m(tmp,y_v(ss+sp)); + tmp = a*x_v(ss+s)+b*tmp; coalescedWrite(z_v[ss+s],tmp); }); } @@ -188,7 +188,6 @@ void G5R5(Lattice &z,const Lattice &x) z.Checkerboard() = x.Checkerboard(); conformable(x,z); int Ls = grid->_rdimensions[0]; - Gamma G5(Gamma::Algebra::Gamma5); autoView( x_v, x, AcceleratorRead); autoView( z_v, z, AcceleratorWrite); uint64_t nloop = grid->oSites()/Ls; @@ -196,7 +195,13 @@ void G5R5(Lattice &z,const Lattice &x) uint64_t ss = sss*Ls; for(int s=0;s &z, const Lattice &x) z.Checkerboard() = x.Checkerboard(); conformable(x, z); - Gamma G5(Gamma::Algebra::Gamma5); - z = G5 * x; + autoView( x_v, x, AcceleratorRead); + autoView( z_v, z, AcceleratorWrite); + uint64_t nloop = grid->oSites(); + accelerator_for(ss,nloop,vobj::Nsimd(),{ + auto tmp = x_v(ss); + decltype(tmp) tmp_p; + decltype(tmp) tmp_m; + spProj5p(tmp_p,tmp); + spProj5m(tmp_m,tmp); + coalescedWrite(z_v[ss],tmp_p - tmp_m); + }); } +/* template void G5C(Lattice> &z, const Lattice> &x) { @@ -234,6 +249,7 @@ void G5C(Lattice> &z, const Lattice& M): M(M), Mom(grid), AuxMom(grid), AuxField(grid){} // Correct - void MomentaDistribution(GridParallelRNG& pRNG){ + 
void MomentaDistribution(GridSerialRNG & sRNG, GridParallelRNG& pRNG){ // Generate a distribution for // P^dag G P // where G = M^-1 // Generate gaussian momenta - Implementation::generate_momenta(Mom, pRNG); + Implementation::generate_momenta(Mom, sRNG, pRNG); // Modify the distribution with the metric M.MSquareRoot(Mom); @@ -107,8 +107,8 @@ public: // Auxiliary momenta // do nothing if trivial, so hide in the metric MomentaField AuxMomTemp(Mom.Grid()); - Implementation::generate_momenta(AuxMom, pRNG); - Implementation::generate_momenta(AuxField, pRNG); + Implementation::generate_momenta(AuxMom, sRNG, pRNG); + Implementation::generate_momenta(AuxField, sRNG, pRNG); // Modify the distribution with the metric // Aux^dag M Aux M.MInvSquareRoot(AuxMom); // AuxMom = M^{-1/2} AuxMomTemp diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h index 0cc0cc1a..675493b3 100644 --- a/Grid/qcd/utils/SUn.h +++ b/Grid/qcd/utils/SUn.h @@ -449,7 +449,8 @@ public: LatticeReal alpha(grid); // std::cout< static void HotConfiguration(GridParallelRNG &pRNG, GaugeField &out) { typedef typename GaugeField::vector_type vector_type; @@ -799,6 +799,88 @@ public: } }; +template +LatticeComplexD Determinant(const Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + auto lvol = grid->lSites(); + LatticeComplexD ret(grid); + + autoView(Umu_v,Umu,CpuRead); + autoView(ret_v,ret,CpuWrite); + thread_for(site,lvol,{ + Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N); + Coordinate lcoor; + grid->LocalIndexToLocalCoor(site, lcoor); + iScalar > > Us; + peekLocalSite(Us, Umu_v, lcoor); + for(int i=0;i +static void ProjectSUn(Lattice > > > &Umu) +{ + Umu = ProjectOnGroup(Umu); + auto det = Determinant(Umu); + + det = conjugate(det); + + for(int i=0;i(Umu,N-1,i); + element = element * det; + PokeIndex(Umu,element,Nc-1,i); + } +} +template +static void ProjectSUn(Lattice >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu(U,mu); + Umu = ProjectOnGroup(Umu); + 
ProjectSUn(Umu); + PokeIndex(U,Umu,mu); + } +} +// Explicit specialisation for SU(3). +// Explicit specialisation for SU(3). +static void +ProjectSU3 (Lattice > > > &Umu) +{ + GridBase *grid=Umu.Grid(); + const int x=0; + const int y=1; + const int z=2; + // Reunitarise + Umu = ProjectOnGroup(Umu); + autoView(Umu_v,Umu,CpuWrite); + thread_for(ss,grid->oSites(),{ + auto cm = Umu_v[ss]; + cm()()(2,x) = adj(cm()()(0,y)*cm()()(1,z)-cm()()(0,z)*cm()()(1,y)); //x= yz-zy + cm()()(2,y) = adj(cm()()(0,z)*cm()()(1,x)-cm()()(0,x)*cm()()(1,z)); //y= zx-xz + cm()()(2,z) = adj(cm()()(0,x)*cm()()(1,y)-cm()()(0,y)*cm()()(1,x)); //z= xy-yx + Umu_v[ss]=cm; + }); +} +static void ProjectSU3(Lattice >,Nd> > &U) +{ + GridBase *grid=U.Grid(); + // Reunitarise + for(int mu=0;mu(U,mu); + Umu = ProjectOnGroup(Umu); + ProjectSU3(Umu); + PokeIndex(U,Umu,mu); + } +} + typedef SU<2> SU2; typedef SU<3> SU3; typedef SU<4> SU4; diff --git a/Grid/serialisation/JSON_IO.cc b/Grid/serialisation/JSON_IO.cc index aca8bab3..f2282099 100644 --- a/Grid/serialisation/JSON_IO.cc +++ b/Grid/serialisation/JSON_IO.cc @@ -26,7 +26,7 @@ *************************************************************************************/ /* END LEGAL */ #include -#ifndef __NVCC__ +#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) NAMESPACE_BEGIN(Grid); diff --git a/Grid/simd/Fujitsu_A64FX_asm_double.h b/Grid/simd/Fujitsu_A64FX_asm_double.h deleted file mode 100644 index 76c556d7..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_double.h +++ /dev/null @@ -1,779 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_double.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later 
version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXd(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXd(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd -#define LOAD_CHI(base) LOAD_CHI_A64FXd(base) -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) -#define XP_PROJ XP_PROJ_A64FXd -#define YP_PROJ YP_PROJ_A64FXd -#define ZP_PROJ ZP_PROJ_A64FXd -#define TP_PROJ TP_PROJ_A64FXd -#define XM_PROJ XM_PROJ_A64FXd -#define YM_PROJ YM_PROJ_A64FXd -#define ZM_PROJ ZM_PROJ_A64FXd -#define TM_PROJ TM_PROJ_A64FXd -#define XP_RECON XP_RECON_A64FXd -#define XM_RECON XM_RECON_A64FXd -#define 
XM_RECON_ACCUM XM_RECON_ACCUM_A64FXd -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXd -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXd -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXd -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXd -#define YP_RECON_ACCUM YP_RECON_ACCUM_A64FXd -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXd -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXd -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXd; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1; } else if (Dir == 2) { LOAD_TABLE2; } -#define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } -// DECLARATIONS -#define DECLARATIONS_A64FXd \ - const uint64_t lut[4][8] = { \ - {4, 5, 6, 7, 0, 1, 2, 3}, \ - {2, 3, 0, 1, 6, 7, 4, 5}, \ - {1, 0, 3, 2, 5, 4, 7, 6}, \ - {0, 1, 2, 4, 5, 6, 7, 8} };\ -asm ( \ - "fmov z31.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXd(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// 
PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// 
PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXd(base) \ -{ \ -asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ -{ \ -asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define LOAD_CHIMU_0213_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXd \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ptrue p5.d \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXd \ -asm ( \ - "tbl z12.d, { z12.d }, z30.d \n\t" \ - "tbl z13.d, { z13.d }, z30.d \n\t" \ - "tbl z14.d, { z14.d }, z30.d \n\t" \ - "tbl z15.d, { z15.d }, z30.d \n\t" \ - "tbl z16.d, { z16.d }, z30.d \n\t" \ - "tbl z17.d, { z17.d }, z30.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ -{ \ -asm ( \ - "ptrue p5.d \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN -#define MULT_2SPIN_1_A64FXd(A) \ -{ \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ -asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.d, p5/m, z31.d \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 0 \n\t" \ - "movprfx z21.d, p5/m, z31.d \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 0 \n\t" \ - "movprfx z19.d, p5/m, z31.d \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 0 \n\t" \ - "movprfx z22.d, p5/m, z31.d \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 0 \n\t" \ - "movprfx z20.d, p5/m, z31.d \n\t" \ - "fcmla z20.d, p5/m, z26.d, z12.d, 0 \n\t" \ - "movprfx z23.d, p5/m, z31.d \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z12.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z15.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z12.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z15.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, 
z12.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z15.d, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXd \ -{ \ -asm ( \ - "fcmla z18.d, p5/m, z27.d, z13.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z27.d, z13.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z27.d, z16.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z28.d, z13.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z28.d, z16.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z29.d, z13.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z29.d, z16.d, 90 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 0 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 0 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 0 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 0 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 0 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 0 \n\t" \ - "fcmla z18.d, p5/m, z24.d, z14.d, 90 \n\t" \ - "fcmla z21.d, p5/m, z24.d, z17.d, 90 \n\t" \ - "fcmla z19.d, p5/m, z25.d, z14.d, 90 \n\t" \ - "fcmla z22.d, p5/m, z25.d, z17.d, 90 \n\t" \ - "fcmla z20.d, p5/m, z26.d, z14.d, 90 \n\t" \ - "fcmla z23.d, p5/m, z26.d, z17.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, 
z22.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z18.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define YP_PROJ_A64FXd \ -{ \ -asm ( \ - "fsub z12.d, p5/m, z12.d, z21.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z22.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z23.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z18.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z19.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 90 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 90 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z20.d, 90 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z18.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z19.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z20.d \n\t" \ - "fadd z15.d, p5/m, z15.d, z21.d \n\t" \ - "fadd z16.d, p5/m, z16.d, z22.d \n\t" \ - "fadd z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z21.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z22.d, 270 \n\t" \ - "fcadd z14.d, p5/m, z14.d, z23.d, 270 \n\t" \ - 
"fcadd z15.d, p5/m, z15.d, z18.d, 270 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z19.d, 270 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z20.d, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXd \ -asm ( \ - "movprfx z6.d, p5/m, z31.d \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "movprfx z7.d, p5/m, z31.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "movprfx z8.d, p5/m, z31.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "movprfx z9.d, p5/m, z31.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "movprfx z10.d, p5/m, z31.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "movprfx z11.d, p5/m, z31.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "mov z0.d, p5/m, z18.d \n\t" \ - "mov z1.d, p5/m, z19.d \n\t" \ - "mov z2.d, p5/m, z20.d \n\t" \ - "mov z3.d, p5/m, z21.d \n\t" \ - "mov z4.d, p5/m, z22.d \n\t" \ - "mov z5.d, p5/m, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXd \ -{ \ -asm ( \ - "fadd z12.d, p5/m, z12.d, z21.d \n\t" \ - "fadd z13.d, p5/m, z13.d, z22.d \n\t" \ - "fadd z14.d, p5/m, z14.d, z23.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z18.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z19.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z20.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXd \ -{ \ -asm ( \ - "fcadd z12.d, p5/m, z12.d, z18.d, 270 \n\t" \ - "fcadd z13.d, p5/m, z13.d, z19.d, 270 \n\t" \ - 
"fcadd z14.d, p5/m, z14.d, z20.d, 270 \n\t" \ - "fcadd z15.d, p5/m, z15.d, z21.d, 90 \n\t" \ - "fcadd z16.d, p5/m, z16.d, z22.d, 90 \n\t" \ - "fcadd z17.d, p5/m, z17.d, z23.d, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXd \ -{ \ -asm ( \ - "ptrue p5.d \n\t" \ - "fsub z12.d, p5/m, z12.d, z18.d \n\t" \ - "fsub z13.d, p5/m, z13.d, z19.d \n\t" \ - "fsub z14.d, p5/m, z14.d, z20.d \n\t" \ - "fsub z15.d, p5/m, z15.d, z21.d \n\t" \ - "fsub z16.d, p5/m, z16.d, z22.d \n\t" \ - "fsub z17.d, p5/m, z17.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z9.d, p5/m, z9.d, z18.d, 90 \n\t" \ - "fcadd z10.d, p5/m, z10.d, z19.d, 90 \n\t" \ - "fcadd z11.d, p5/m, z11.d, z20.d, 90 \n\t" \ - "fcadd z6.d, p5/m, z6.d, z21.d, 90 \n\t" \ - "fcadd z7.d, p5/m, z7.d, z22.d, 90 \n\t" \ - "fcadd z8.d, p5/m, z8.d, z23.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d 
\n\t" \ - "fsub z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 270 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 270 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 270 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 90 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 90 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 90 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXd \ -asm ( \ - "fcadd z6.d, p5/m, z6.d, z18.d, 90 \n\t" \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fcadd z7.d, p5/m, z7.d, z19.d, 90 \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fcadd z8.d, p5/m, z8.d, z20.d, 90 \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fcadd z9.d, p5/m, z9.d, z21.d, 270 \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fcadd z10.d, p5/m, z10.d, z22.d, 270 \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fcadd z11.d, p5/m, z11.d, z23.d, 270 \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z20.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z18.d \n\t" \ - "fsub z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z19.d \n\t" \ - "fsub z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z2.d, p5/m, 
z2.d, z20.d \n\t" \ - "fsub z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z21.d \n\t" \ - "fsub z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z22.d \n\t" \ - "fsub z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z23.d \n\t" \ - "fsub z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXd \ -asm ( \ - "ptrue p5.d \n\t" \ - "fmov z0.d , 0 \n\t" \ - "fmov z1.d , 0 \n\t" \ - "fmov z2.d , 0 \n\t" \ - "fmov z3.d , 0 \n\t" \ - "fmov z4.d , 0 \n\t" \ - "fmov z5.d , 0 \n\t" \ - "fmov z6.d , 0 \n\t" \ - "fmov z7.d , 0 \n\t" \ - "fmov z8.d , 0 \n\t" \ - "fmov z9.d , 0 \n\t" \ - "fmov z10.d , 0 \n\t" \ - "fmov z11.d , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXd \ -asm ( \ - "fadd z0.d, p5/m, z0.d, z12.d \n\t" \ - "fadd z1.d, p5/m, z1.d, z13.d \n\t" \ - "fadd z2.d, p5/m, z2.d, z14.d \n\t" \ - "fadd z3.d, p5/m, z3.d, z15.d \n\t" \ - "fadd z4.d, p5/m, z4.d, z16.d \n\t" \ - "fadd z5.d, p5/m, z5.d, z17.d \n\t" \ - "fadd z6.d, p5/m, z6.d, z18.d \n\t" \ - "fadd z7.d, p5/m, z7.d, z19.d \n\t" \ - "fadd z8.d, p5/m, z8.d, z20.d \n\t" \ - "fadd z9.d, p5/m, z9.d, z21.d \n\t" \ - "fadd z10.d, p5/m, z10.d, z22.d \n\t" \ - "fadd z11.d, p5/m, z11.d, z23.d \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - diff --git a/Grid/simd/Fujitsu_A64FX_asm_single.h b/Grid/simd/Fujitsu_A64FX_asm_single.h deleted file mode 100644 index d809f83b..00000000 --- a/Grid/simd/Fujitsu_A64FX_asm_single.h +++ /dev/null @@ -1,779 +0,0 @@ -/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: Fujitsu_A64FX_asm_single.h - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -#define LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_A64FXf(base) -#define PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) -#define PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_A64FXf(A) -#define PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) -#define PF_GAUGE(A) -#define PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(A) -#define PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(A) -#define PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A) -#define LOCK_GAUGE(A) -#define UNLOCK_GAUGE(A) -#define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) -#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) -#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf -#define LOAD_CHI(base) LOAD_CHI_A64FXf(base) -#define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) -#define XP_PROJ XP_PROJ_A64FXf -#define YP_PROJ YP_PROJ_A64FXf -#define ZP_PROJ ZP_PROJ_A64FXf -#define TP_PROJ TP_PROJ_A64FXf -#define XM_PROJ XM_PROJ_A64FXf -#define YM_PROJ YM_PROJ_A64FXf -#define ZM_PROJ ZM_PROJ_A64FXf -#define TM_PROJ TM_PROJ_A64FXf -#define XP_RECON XP_RECON_A64FXf -#define XM_RECON XM_RECON_A64FXf -#define XM_RECON_ACCUM XM_RECON_ACCUM_A64FXf -#define YM_RECON_ACCUM YM_RECON_ACCUM_A64FXf -#define ZM_RECON_ACCUM ZM_RECON_ACCUM_A64FXf -#define TM_RECON_ACCUM TM_RECON_ACCUM_A64FXf -#define XP_RECON_ACCUM XP_RECON_ACCUM_A64FXf -#define YP_RECON_ACCUM 
YP_RECON_ACCUM_A64FXf -#define ZP_RECON_ACCUM ZP_RECON_ACCUM_A64FXf -#define TP_RECON_ACCUM TP_RECON_ACCUM_A64FXf -#define PERMUTE_DIR0 0 -#define PERMUTE_DIR1 1 -#define PERMUTE_DIR2 2 -#define PERMUTE_DIR3 3 -#define PERMUTE PERMUTE_A64FXf; -#define LOAD_TABLE(Dir) if (Dir == 0) { LOAD_TABLE0; } else if (Dir == 1) { LOAD_TABLE1 } else if (Dir == 2) { LOAD_TABLE2; } else if (Dir == 3) { LOAD_TABLE3; } -#define MAYBEPERM(A,perm) if (perm) { PERMUTE; } -// DECLARATIONS -#define DECLARATIONS_A64FXf \ - const uint32_t lut[4][16] = { \ - {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ - {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ - {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \ - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \ -asm ( \ - "fmov z31.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// RESULT -#define RESULT_A64FXf(base) \ -{ \ -asm ( \ - "str z0, [%[storeptr], -6, mul vl] \n\t" \ - "str z1, [%[storeptr], -5, mul vl] \n\t" \ - "str z2, [%[storeptr], -4, mul vl] \n\t" \ - "str z3, [%[storeptr], -3, mul vl] \n\t" \ - "str z4, [%[storeptr], -2, mul vl] \n\t" \ - "str z5, [%[storeptr], -1, mul vl] \n\t" \ - "str z6, [%[storeptr], 0, mul vl] \n\t" \ - "str z7, [%[storeptr], 1, mul vl] \n\t" \ - "str z8, [%[storeptr], 2, mul vl] \n\t" \ - "str z9, [%[storeptr], 3, mul vl] \n\t" \ - "str z10, [%[storeptr], 4, mul vl] \n\t" \ - "str z11, [%[storeptr], 5, mul vl] \n\t" \ - : \ - : [storeptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L2 (prefetch to L2) -#define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd 
PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_CHIMU_L1 (prefetch to L1) -#define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L2 (prefetch to L2) -#define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ -asm ( \ - "prfd PLDL2STRM, p5, [%[fetchptr], -4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 12, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 16, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 20, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 24, mul vl] \n\t" \ - "prfd PLDL2STRM, p5, [%[fetchptr], 28, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_GAUGE_L1 (prefetch to L1) -#define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); 
uint64_t baseU = (uint64_t)&ref; \ -asm ( \ - "prfd PLDL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PLDL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHI -#define LOAD_CHI_A64FXf(base) \ -{ \ -asm ( \ - "ldr z12, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU -#define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ -{ \ -asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0213 -#define 
LOAD_CHIMU_0213_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_CHIMU_0312 -#define LOAD_CHIMU_0312_A64FXf \ -{ \ - const SiteSpinor & ref(in[offset]); \ -asm ( \ - "ptrue p5.s \n\t" \ - "ldr z12, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z21, [%[fetchptr], 3, mul vl] \n\t" \ - "ldr z13, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z22, [%[fetchptr], 4, mul vl] \n\t" \ - "ldr z14, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z23, [%[fetchptr], 5, mul vl] \n\t" \ - "ldr z15, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z18, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z16, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z19, [%[fetchptr], 1, mul vl] \n\t" \ - "ldr z17, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z20, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (&ref[2][0]) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// LOAD_TABLE0 -#define LOAD_TABLE0 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (0) \ - : 
"memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE1 -#define LOAD_TABLE1 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (1) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE2 -#define LOAD_TABLE2 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (2) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_TABLE3 -#define LOAD_TABLE3 \ -asm ( \ - "ldr z30, [%[tableptr], %[index], mul vl] \n\t" \ - : \ - : [tableptr] "r" (&lut[0]),[index] "i" (3) \ - : "memory","cc","p5","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PERMUTE -#define PERMUTE_A64FXf \ -asm ( \ - "tbl z12.s, { z12.s }, z30.s \n\t" \ - "tbl z13.s, { z13.s }, z30.s \n\t" \ - "tbl z14.s, { z14.s }, z30.s \n\t" \ - "tbl z15.s, { z15.s }, z30.s \n\t" \ - "tbl z16.s, { z16.s }, z30.s \n\t" \ - "tbl z17.s, { z17.s }, z30.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ -{ \ -asm ( \ - "ptrue p5.s \n\t" \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, 
[%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN -#define MULT_2SPIN_1_A64FXf(A) \ -{ \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ -asm ( \ - "ldr z24, [%[fetchptr], -6, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -3, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 0, mul vl] \n\t" \ - "ldr z27, [%[fetchptr], -5, mul vl] \n\t" \ - "ldr z28, [%[fetchptr], -2, mul vl] \n\t" \ - "ldr z29, [%[fetchptr], 1, mul vl] \n\t" \ - "movprfx z18.s, p5/m, z31.s \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 0 \n\t" \ - "movprfx z21.s, p5/m, z31.s \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 0 \n\t" \ - "movprfx z19.s, p5/m, z31.s \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 0 \n\t" \ - "movprfx z22.s, p5/m, z31.s \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 0 \n\t" \ - "movprfx z20.s, p5/m, z31.s \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 0 \n\t" \ - "movprfx z23.s, p5/m, z31.s \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z12.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z15.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z12.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z15.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z12.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z15.s, 90 \n\t" \ - "ldr z24, [%[fetchptr], -4, mul vl] \n\t" \ - "ldr z25, [%[fetchptr], -1, mul vl] \n\t" \ - "ldr z26, [%[fetchptr], 2, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (baseU + 2 * 3 * 64) \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// MULT_2SPIN_BACKEND -#define MULT_2SPIN_2_A64FXf \ -{ \ -asm ( \ - "fcmla z18.s, p5/m, z27.s, z13.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z27.s, z13.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z27.s, z16.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z28.s, z13.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z28.s, z16.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z29.s, z13.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z29.s, z16.s, 90 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 0 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 0 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 0 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 0 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 0 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 0 \n\t" \ - "fcmla z18.s, p5/m, z24.s, z14.s, 90 \n\t" \ - "fcmla z21.s, p5/m, z24.s, z17.s, 90 \n\t" \ - "fcmla z19.s, p5/m, z25.s, z14.s, 90 \n\t" \ - "fcmla z22.s, p5/m, z25.s, z17.s, 90 \n\t" \ - "fcmla z20.s, p5/m, z26.s, z14.s, 90 \n\t" \ - "fcmla z23.s, p5/m, z26.s, z17.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_PROJ -#define XP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 90 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 90 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XP_RECON -#define XP_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// XP_RECON_ACCUM -#define XP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_PROJ -#define 
YP_PROJ_A64FXf \ -{ \ -asm ( \ - "fsub z12.s, p5/m, z12.s, z21.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z22.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z23.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z18.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z19.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZP_PROJ -#define ZP_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 90 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 90 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 90 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z23.s, 270 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TP_PROJ -#define TP_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z18.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z19.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z20.s \n\t" \ - "fadd z15.s, p5/m, z15.s, z21.s \n\t" \ - "fadd z16.s, p5/m, z16.s, z22.s \n\t" \ - "fadd z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_PROJ -#define XM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z21.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z22.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z23.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z18.s, 270 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z19.s, 270 \n\t" \ - "fcadd z17.s, p5/m, z17.s, z20.s, 270 \n\t" \ - : \ - : \ - : 
"p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON -#define XM_RECON_A64FXf \ -asm ( \ - "movprfx z6.s, p5/m, z31.s \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "movprfx z7.s, p5/m, z31.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "movprfx z8.s, p5/m, z31.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "movprfx z9.s, p5/m, z31.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "movprfx z10.s, p5/m, z31.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "movprfx z11.s, p5/m, z31.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "mov z0.s, p5/m, z18.s \n\t" \ - "mov z1.s, p5/m, z19.s \n\t" \ - "mov z2.s, p5/m, z20.s \n\t" \ - "mov z3.s, p5/m, z21.s \n\t" \ - "mov z4.s, p5/m, z22.s \n\t" \ - "mov z5.s, p5/m, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_PROJ -#define YM_PROJ_A64FXf \ -{ \ -asm ( \ - "fadd z12.s, p5/m, z12.s, z21.s \n\t" \ - "fadd z13.s, p5/m, z13.s, z22.s \n\t" \ - "fadd z14.s, p5/m, z14.s, z23.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z18.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z19.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z20.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// ZM_PROJ -#define ZM_PROJ_A64FXf \ -{ \ -asm ( \ - "fcadd z12.s, p5/m, z12.s, z18.s, 270 \n\t" \ - "fcadd z13.s, p5/m, z13.s, z19.s, 270 \n\t" \ - "fcadd z14.s, p5/m, z14.s, z20.s, 270 \n\t" \ - "fcadd z15.s, p5/m, z15.s, z21.s, 90 \n\t" \ - "fcadd z16.s, p5/m, z16.s, z22.s, 90 \n\t" \ - "fcadd z17.s, p5/m, 
z17.s, z23.s, 90 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// TM_PROJ -#define TM_PROJ_A64FXf \ -{ \ -asm ( \ - "ptrue p5.s \n\t" \ - "fsub z12.s, p5/m, z12.s, z18.s \n\t" \ - "fsub z13.s, p5/m, z13.s, z19.s \n\t" \ - "fsub z14.s, p5/m, z14.s, z20.s \n\t" \ - "fsub z15.s, p5/m, z15.s, z21.s \n\t" \ - "fsub z16.s, p5/m, z16.s, z22.s \n\t" \ - "fsub z17.s, p5/m, z17.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); \ -} -// XM_RECON_ACCUM -#define XM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z9.s, p5/m, z9.s, z18.s, 90 \n\t" \ - "fcadd z10.s, p5/m, z10.s, z19.s, 90 \n\t" \ - "fcadd z11.s, p5/m, z11.s, z20.s, 90 \n\t" \ - "fcadd z6.s, p5/m, z6.s, z21.s, 90 \n\t" \ - "fcadd z7.s, p5/m, z7.s, z22.s, 90 \n\t" \ - "fcadd z8.s, p5/m, z8.s, z23.s, 90 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YP_RECON_ACCUM -#define YP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s 
\n\t" \ - "fadd z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// YM_RECON_ACCUM -#define YM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZP_RECON_ACCUM -#define ZP_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 270 \n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z19.s, 270 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 270 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 90 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 90 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 90 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZM_RECON_ACCUM -#define ZM_RECON_ACCUM_A64FXf \ -asm ( \ - "fcadd z6.s, p5/m, z6.s, z18.s, 90 
\n\t" \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fcadd z7.s, p5/m, z7.s, z19.s, 90 \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fcadd z8.s, p5/m, z8.s, z20.s, 90 \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fcadd z9.s, p5/m, z9.s, z21.s, 270 \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fcadd z10.s, p5/m, z10.s, z22.s, 270 \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fcadd z11.s, p5/m, z11.s, z23.s, 270 \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TP_RECON_ACCUM -#define TP_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// TM_RECON_ACCUM -#define TM_RECON_ACCUM_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z18.s \n\t" \ - "fsub z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z1.s, p5/m, z1.s, z19.s \n\t" \ - "fsub z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z20.s \n\t" \ - "fsub z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z21.s \n\t" \ - "fsub z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z22.s \n\t" \ - "fsub z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z23.s \n\t" \ - "fsub z11.s, p5/m, z11.s, z23.s \n\t" 
\ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// ZERO_PSI -#define ZERO_PSI_A64FXf \ -asm ( \ - "ptrue p5.s \n\t" \ - "fmov z0.s , 0 \n\t" \ - "fmov z1.s , 0 \n\t" \ - "fmov z2.s , 0 \n\t" \ - "fmov z3.s , 0 \n\t" \ - "fmov z4.s , 0 \n\t" \ - "fmov z5.s , 0 \n\t" \ - "fmov z6.s , 0 \n\t" \ - "fmov z7.s , 0 \n\t" \ - "fmov z8.s , 0 \n\t" \ - "fmov z9.s , 0 \n\t" \ - "fmov z10.s , 0 \n\t" \ - "fmov z11.s , 0 \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) -#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PSTL2STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL2STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// PREFETCH_RESULT_L1_STORE (prefetch store to L1) -#define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ -{ \ -asm ( \ - "prfd PSTL1STRM, p5, [%[fetchptr], 0, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 4, mul vl] \n\t" \ - "prfd PSTL1STRM, p5, [%[fetchptr], 8, mul vl] \n\t" \ - : \ - : [fetchptr] "r" (base) \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31","memory" \ -); \ -} -// ADD_RESULT_INTERNAL -#define ADD_RESULT_INTERNAL_A64FXf \ -asm ( \ - "fadd z0.s, p5/m, z0.s, z12.s \n\t" 
\ - "fadd z1.s, p5/m, z1.s, z13.s \n\t" \ - "fadd z2.s, p5/m, z2.s, z14.s \n\t" \ - "fadd z3.s, p5/m, z3.s, z15.s \n\t" \ - "fadd z4.s, p5/m, z4.s, z16.s \n\t" \ - "fadd z5.s, p5/m, z5.s, z17.s \n\t" \ - "fadd z6.s, p5/m, z6.s, z18.s \n\t" \ - "fadd z7.s, p5/m, z7.s, z19.s \n\t" \ - "fadd z8.s, p5/m, z8.s, z20.s \n\t" \ - "fadd z9.s, p5/m, z9.s, z21.s \n\t" \ - "fadd z10.s, p5/m, z10.s, z22.s \n\t" \ - "fadd z11.s, p5/m, z11.s, z23.s \n\t" \ - : \ - : \ - : "p5","cc","z0","z1","z2","z3","z4","z5","z6","z7","z8","z9","z10","z11","z12","z13","z14","z15","z16","z17","z18","z19","z20","z21","z22","z23","z24","z25","z26","z27","z28","z29","z30","z31" \ -); - diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h index 232610f2..b645c365 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_double.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXd -#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXd(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd #define LOAD_CHI(base) LOAD_CHI_A64FXd(base) +#define ZERO_PSI ZERO_PSI_A64FXd #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXd; RESULT_A64FXd(base) #define XP_PROJ XP_PROJ_A64FXd #define YP_PROJ YP_PROJ_A64FXd @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(Dir,perm) if (Dir != 3) { if (perm) { PERMUTE; } } // DECLARATIONS #define DECLARATIONS_A64FXd \ + uint64_t baseU; \ const uint64_t lut[4][8] = { \ {4, 5, 6, 7, 0, 1, 2, 3}, \ {2, 3, 0, 1, 6, 7, 4, 5}, \ @@ -126,114 +128,114 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXd(base) \ { \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64), 
result_02); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, 
(void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; 
\ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXd(base) \ { \ - Chi_00 = svld1(pg1, (float64_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float64_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float64_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float64_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float64_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float64_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float64_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXd(base) \ { \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 
-1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, 
(float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXd \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float64_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), 
(int64_t)(3)); \ + Chimu_01 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float64_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ } // MULT_2SPIN #define 
MULT_2SPIN_1_A64FXd(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float64_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float64_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXd \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f64(0.); \ result_32 = svdup_f64(0.); -// PREFETCH_RESULT_L2_STORE (prefetch store to L2) 
+// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXd(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h index 180e5f4f..0b874f02 100644 --- a/Grid/simd/Fujitsu_A64FX_intrin_single.h +++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h @@ -38,10 +38,11 @@ Author: Nils Meyer #define LOCK_GAUGE(A) #define UNLOCK_GAUGE(A) #define MASK_REGS DECLARATIONS_A64FXf -#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B) +#define SAVE_RESULT(A,B) RESULT_A64FXf(A); #define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir) #define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf #define LOAD_CHI(base) LOAD_CHI_A64FXf(base) +#define ZERO_PSI ZERO_PSI_A64FXf #define ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_A64FXf; RESULT_A64FXf(base) #define XP_PROJ XP_PROJ_A64FXf #define YP_PROJ YP_PROJ_A64FXf @@ -70,6 +71,7 @@ Author: Nils Meyer #define MAYBEPERM(A,perm) if (perm) { PERMUTE; } // DECLARATIONS #define DECLARATIONS_A64FXf \ + uint64_t baseU; \ const uint32_t lut[4][16] = { \ {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \ {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \ @@ -126,114 +128,114 @@ Author: Nils Meyer // RESULT #define RESULT_A64FXf(base) \ { \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64), result_00); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64), result_01); \ - svst1(pg1, 
(float32_t*)(base + 2 * 3 * 64 + -4 * 64), result_02); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64), result_10); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64), result_11); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64), result_12); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64), result_20); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64), result_21); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64), result_22); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64), result_30); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64), result_31); \ - svst1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64), result_32); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-6), result_00); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-5), result_01); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-4), result_02); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-3), result_10); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-2), result_11); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(-1), result_12); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(0), result_20); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(1), result_21); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(2), result_22); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(3), result_30); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(4), result_31); \ + svst1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64),(int64_t)(5), result_32); \ } // PREFETCH_CHIMU_L2 (prefetch to L2) #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), 
SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \ } // PREFETCH_CHIMU_L1 (prefetch to L1) #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \ } // PREFETCH_GAUGE_L2 (prefetch to L2) #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \ { \ - const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \ - svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \ - svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \ + const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \ } // PREFETCH_GAUGE_L1 (prefetch to L1) #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \ { \ - const auto & 
ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \ - svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \ + svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \ } // LOAD_CHI #define LOAD_CHI_A64FXf(base) \ { \ - Chi_00 = svld1(pg1, (float32_t*)(base + 0 * 64)); \ - Chi_01 = svld1(pg1, (float32_t*)(base + 1 * 64)); \ - Chi_02 = svld1(pg1, (float32_t*)(base + 2 * 64)); \ - Chi_10 = svld1(pg1, (float32_t*)(base + 3 * 64)); \ - Chi_11 = svld1(pg1, (float32_t*)(base + 4 * 64)); \ - Chi_12 = svld1(pg1, (float32_t*)(base + 5 * 64)); \ + Chi_00 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(0)); \ + Chi_01 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(1)); \ + Chi_02 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(2)); \ + Chi_10 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(3)); \ + Chi_11 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(4)); \ + Chi_12 = svld1_vnum(pg1, (float32_t*)(base), (int64_t)(5)); \ } // LOAD_CHIMU #define LOAD_CHIMU_INTERLEAVED_A64FXf(base) \ { \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_12 
= svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_CHIMU_0213 #define LOAD_CHIMU_0213_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 
5 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_30 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ } // LOAD_CHIMU_0312 #define LOAD_CHIMU_0312_A64FXf \ { \ const SiteSpinor & ref(in[offset]); \ - Chimu_00 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -6 * 64)); \ - Chimu_30 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 3 * 64)); \ - Chimu_01 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -5 * 64)); \ - Chimu_31 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 4 * 64)); \ - Chimu_02 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -4 * 64)); \ - Chimu_32 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 5 * 64)); \ - Chimu_10 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -3 * 64)); \ - Chimu_20 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 0 * 64)); \ - Chimu_11 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -2 * 64)); \ - Chimu_21 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 1 * 64)); \ - Chimu_12 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + -1 * 64)); \ - Chimu_22 = svld1(pg1, (float32_t*)(base + 2 * 3 * 64 + 2 * 64)); \ + Chimu_00 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-6)); \ + Chimu_30 = 
svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(3)); \ + Chimu_01 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-5)); \ + Chimu_31 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(4)); \ + Chimu_02 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-4)); \ + Chimu_32 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(5)); \ + Chimu_10 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-3)); \ + Chimu_20 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(0)); \ + Chimu_11 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-2)); \ + Chimu_21 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(1)); \ + Chimu_12 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(-1)); \ + Chimu_22 = svld1_vnum(pg1, (float32_t*)(base + 2 * 3 * 64), (int64_t)(2)); \ } // LOAD_TABLE0 #define LOAD_TABLE0 \ @@ -261,26 +263,26 @@ Author: Nils Meyer Chi_12 = svtbl(Chi_12, table0); // LOAD_GAUGE -#define LOAD_GAUGE \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ +#define LOAD_GAUGE(A) \ { \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 
64), (int64_t)(1)); \ } // MULT_2SPIN #define MULT_2SPIN_1_A64FXf(A) \ { \ - const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -6 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -3 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 0 * 64)); \ - U_01 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -5 * 64)); \ - U_11 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -2 * 64)); \ - U_21 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 1 * 64)); \ + const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-6)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-3)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(0)); \ + U_01 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-5)); \ + U_11 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-2)); \ + U_21 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(1)); \ UChi_00 = svcmla_x(pg1, zero0, U_00, Chi_00, 0); \ UChi_10 = svcmla_x(pg1, zero0, U_00, Chi_10, 0); \ UChi_01 = svcmla_x(pg1, zero0, U_10, Chi_00, 0); \ @@ -293,9 +295,9 @@ Author: Nils Meyer UChi_11 = svcmla_x(pg1, UChi_11, U_10, Chi_10, 90); \ UChi_02 = svcmla_x(pg1, UChi_02, U_20, Chi_00, 90); \ UChi_12 = svcmla_x(pg1, UChi_12, U_20, Chi_10, 90); \ - U_00 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -4 * 64)); \ - U_10 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + -1 * 64)); \ - U_20 = svld1(pg1, (float32_t*)(baseU + 2 * 3 * 64 + 2 * 64)); \ + U_00 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-4)); \ + U_10 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(-1)); \ + U_20 = svld1_vnum(pg1, (float32_t*)(baseU + 2 * 3 * 64), (int64_t)(2)); \ } // MULT_2SPIN_BACKEND #define MULT_2SPIN_2_A64FXf \ @@ -570,12 +572,12 @@ Author: Nils Meyer result_31 = svdup_f32(0.); \ result_32 = svdup_f32(0.); -// 
PREFETCH_RESULT_L2_STORE (prefetch store to L2) +// PREFETCH_RESULT_L2_STORE (uses DC ZVA for cache line zeroing) #define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \ { \ - svprfd(pg1, (int64_t*)(base + 0), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 256), SV_PSTL2STRM); \ - svprfd(pg1, (int64_t*)(base + 512), SV_PSTL2STRM); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 0) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 1) : "memory" ); \ + asm( "dc zva, %[fetchptr] \n\t" : : [fetchptr] "r" (base + 256 * 2) : "memory" ); \ } // PREFETCH_RESULT_L1_STORE (prefetch store to L1) #define PREFETCH_RESULT_L1_STORE_INTERNAL_A64FXf(base) \ diff --git a/Grid/simd/Fujitsu_A64FX_undef.h b/Grid/simd/Fujitsu_A64FX_undef.h index 81eec37a..51762a60 100644 --- a/Grid/simd/Fujitsu_A64FX_undef.h +++ b/Grid/simd/Fujitsu_A64FX_undef.h @@ -46,6 +46,7 @@ Author: Nils Meyer #undef MULT_2SPIN_2 #undef MAYBEPERM #undef LOAD_CHI +#undef ZERO_PSI #undef XP_PROJ #undef YP_PROJ #undef ZP_PROJ diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index 8b17f75a..b2c7588f 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -38,12 +38,20 @@ Author: Peter Boyle #ifdef GRID_HIP #include #endif +#ifdef GRID_SYCL +namespace Grid { + typedef struct { uint16_t x;} half; + typedef struct { half x; half y;} half2; + typedef struct { float x; float y;} float2; + typedef struct { double x; double y;} double2; +} +#endif + namespace Grid { -#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) -typedef struct { uint16_t x;} half; -#endif + + typedef struct Half2_t { half x; half y; } Half2; #define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH ) @@ -52,11 +60,26 @@ template class GpuComplex { public: pair z; - typedef decltype(z.x) real; + typedef decltype(z.x) Real; public: accelerator_inline GpuComplex() = default; - accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; }; + accelerator_inline GpuComplex(Real 
re,Real im) { z.x=re; z.y=im; }; accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;}; + accelerator_inline Real real(void) const { return z.x; }; + accelerator_inline Real imag(void) const { return z.y; }; + accelerator_inline GpuComplex &operator=(const Zero &zz) { z.x = 0; z.y=0; return *this; }; + accelerator_inline GpuComplex &operator*=(const GpuComplex &r) { + *this = (*this) * r; + return *this; + } + accelerator_inline GpuComplex &operator+=(const GpuComplex &r) { + *this = (*this) + r; + return *this; + } + accelerator_inline GpuComplex &operator-=(const GpuComplex &r) { + *this = (*this) - r; + return *this; + } friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) { GpuComplex r ; r.z.x = lhs.z.x + rhs.z.x; @@ -149,6 +172,11 @@ typedef GpuVector GpuVectorRD; typedef GpuVector GpuVectorCD; typedef GpuVector GpuVectorI; +accelerator_inline GpuComplexF timesI(const GpuComplexF &r) { return(GpuComplexF(-r.imag(),r.real()));} +accelerator_inline GpuComplexD timesI(const GpuComplexD &r) { return(GpuComplexD(-r.imag(),r.real()));} +accelerator_inline GpuComplexF timesMinusI(const GpuComplexF &r){ return(GpuComplexF(r.imag(),-r.real()));} +accelerator_inline GpuComplexD timesMinusI(const GpuComplexD &r){ return(GpuComplexD(r.imag(),-r.real()));} + accelerator_inline float half2float(half h) { float f; @@ -156,7 +184,7 @@ accelerator_inline float half2float(half h) f = __half2float(h); #else Grid_half hh; - hh.x = hr.x; + hh.x = h.x; f= sfw_half_to_float(hh); #endif return f; diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index c07077a3..4f952bb2 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -208,8 +208,8 @@ struct RealPart > { ////////////////////////////////////// // type alias used to simplify the syntax of std::enable_if template using Invoke = typename T::type; -template using EnableIf = Invoke >; -template using NotEnableIf = Invoke >; 
+template using EnableIf = Invoke >; +template using NotEnableIf = Invoke >; //////////////////////////////////////////////////////// // Check for complexity with type traits diff --git a/Grid/simd/Grid_vector_unops.h b/Grid/simd/Grid_vector_unops.h index d225699b..b89bb785 100644 --- a/Grid/simd/Grid_vector_unops.h +++ b/Grid/simd/Grid_vector_unops.h @@ -125,14 +125,6 @@ accelerator_inline Grid_simd sqrt(const Grid_simd &r) { return SimdApply(SqrtRealFunctor(), r); } template -accelerator_inline Grid_simd rsqrt(const Grid_simd &r) { - return SimdApply(RSqrtRealFunctor(), r); -} -template -accelerator_inline Scalar rsqrt(const Scalar &r) { - return (RSqrtRealFunctor(), r); -} -template accelerator_inline Grid_simd cos(const Grid_simd &r) { return SimdApply(CosRealFunctor(), r); } diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h index 1dc86c1b..76ca3bef 100644 --- a/Grid/simd/Simd.h +++ b/Grid/simd/Simd.h @@ -148,10 +148,14 @@ accelerator_inline void sub (ComplexF * __restrict__ y,const ComplexF * __restri accelerator_inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); } //conjugate already supported for complex -accelerator_inline ComplexF timesI(const ComplexF &r) { return(r*ComplexF(0.0,1.0));} -accelerator_inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} -accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} -accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} +accelerator_inline ComplexF timesI(const ComplexF &r) { return(ComplexF(-r.imag(),r.real()));} +accelerator_inline ComplexD timesI(const ComplexD &r) { return(ComplexD(-r.imag(),r.real()));} +accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(ComplexF(r.imag(),-r.real()));} +accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(ComplexD(r.imag(),-r.real()));} +//accelerator_inline ComplexF timesI(const 
ComplexF &r) { return(r*ComplexF(0.0,1.0));} +//accelerator_inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} +//accelerator_inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} +//accelerator_inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} // define projections to real and imaginay parts accelerator_inline ComplexF projReal(const ComplexF &r){return( ComplexF(r.real(), 0.0));} diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py deleted file mode 100755 index f00a5019..00000000 --- a/Grid/simd/gridverter.py +++ /dev/null @@ -1,2377 +0,0 @@ -#!/usr/bin/python3 - -import re -import argparse -import sys - -# Grid for A64FX -# -# * should align std::vector to (multiples of) cache block size = 256 bytes - -# place benchmark runtime in cycles here ! -measured_cycles = 690 #1500 #775 #1500 - - -# command line parser -parser = argparse.ArgumentParser(description="Dslash generator.") -parser.add_argument("--single", action="store_true", default="False") -parser.add_argument("--double", action="store_true", default="True") -parser.add_argument("--debug", action="store_true", default="False") -parser.add_argument("--gridbench", action="store_true", default="False") -args = parser.parse_args() - -print(args) - -ASM_LOAD_CHIMU = True # load chimu -ASM_LOAD_GAUGE = True # load gauge -ASM_LOAD_TABLE = True # load table -ASM_STORE = True # store result - -# Disable all loads and stores in asm for benchmarking purposes -#DISABLE_ASM_LOAD_STORE = True -DISABLE_ASM_LOAD_STORE = False - -if DISABLE_ASM_LOAD_STORE: - ASM_LOAD_CHIMU = True # load chimu - ASM_LOAD_GAUGE = True # load gauge - ASM_LOAD_TABLE = True # load table - ASM_STORE = False # store result - -# Alternative implementation using PROJ specific loads works, -# but be careful with predication - -ALTERNATIVE_LOADS = False -#ALTERNATIVE_LOADS = not ALTERNATIVE_LOADS # True - -# Alternative register mapping, -# must use with 
my_wilson4.h and my_wilson4pf.h - -ALTERNATIVE_REGISTER_MAPPING = False -#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING - -if ALTERNATIVE_REGISTER_MAPPING == True: - ALTERNATIVE_LOADS = False - -# use movprfx -MOVPRFX = False -MOVPRFX = not MOVPRFX - - -PREFETCH = False -PREFETCH = not PREFETCH # True - -PRECISION = 'double' # DP by default -PRECSUFFIX = 'A64FXd' -if args.single == True: - PRECISION = 'single' - PRECSUFFIX = 'A64FXf' - -_DEBUG = False #True # insert debugging output -if args.debug == True: - _DEBUG = True - -GRIDBENCH = False -if args.gridbench == True: - GRIDBENCH = True - -print("PRECISION = ", PRECISION) -print("DEBUG = ", _DEBUG) -print("ALTERNATIVE_LOADS = ", ALTERNATIVE_LOADS) -print("ALTERNATIVE_REGISTER_MAPPING = ", ALTERNATIVE_REGISTER_MAPPING) -print("MOVPRFX = ", MOVPRFX) -print("DISABLE_ASM_LOAD_STORE = ", DISABLE_ASM_LOAD_STORE) -print("GRIDBENCH = ", GRIDBENCH) - -print("") - -#sys.exit(0) - - -#_DEBUG = True # insert debugging output - -FETCH_BASE_PTR_COLOR_OFFSET = 2 # offset for scalar plus signed immediate addressing -STORE_BASE_PTR_COLOR_OFFSET = 2 - -# 64-bit gp register usage !!! armclang 20.0 complains about the register choice !!! -# table address: x30 -# data address: x29 -# store address: x28 -# debug address: r8 - -# Max performance of complex FMA using FCMLA instruction -# is 25% peak. -# -# Issue latency of FCMLA is 2 cycles. -# Need 2 FCMLA instructions for complex FMA. -# Complete complex FMA takes 4 cycles. -# Peak throughput is 4 * 8 Flops DP = 32 Flops DP in 4 cycles. -# A64FX FMA throughput is 4 * 8 * 2 * 2 = 132 Flops DP in 4 cycles. 
-# -> 25% peak FMA -# -# In: 3x 512 bits = 192 bytes -# Out: 1x 512 bits = 64 bytes -# Tot: 4x 512 bits = 256 bytes -# -# 256 bytes * 2.2 GHz = 563.2 GB/s (base 10), 524 GB/s (base 2) - -OPT = """ -* interleave prefetching and compute in MULT_2SPIN -* could test storing U's in MULT_2SPIN to L1d for cache line update -* structure reordering: MAYBEPERM after MULT_2SPIN ? -""" - -filename = 'XXX' -LEGAL = """/************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: {} - - Copyright (C) 2020 - -Author: Nils Meyer - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - See the full license in the file "LICENSE" in the top level distribution directory -*************************************************************************************/ -/* END LEGAL */ -""" - -class Register: - - def __init__(self, variable, asmreg='X', predication=False): - global d - x = 'Y' - if predication == False: - x = asmreg # + d['asmsuffix'] - else: - x = asmreg - self.asmreg = x - self.asmregwithsuffix = asmreg + d['asmsuffix'] - self.asmregbyte = asmreg + '.b' - self.name = variable - self.asmname = variable - self.asmnamebyte = variable + '.b' - self.predication = predication - - d['registers'] += 1 - - def define(self, statement): - global d - d['C'] += F'#define {self.name} {statement}' - #d['A'] += F'#define {self.name} {statement}' - - def declare(self, predication=False): - global d - - if self.predication == False: - d['C'] += F' Simd {self.name}; \\\n' - - predtype = 'svfloat64_t' - if PRECISION == 'single': - predtype = 'svfloat32_t' - - d['I'] += F' {predtype} {self.name}; \\\n' - else: - d['I'] += F' svbool_t {self.name}; \\\n' - #d['A'] += F'#define {self.name} {self.asmreg} \n' - - def loadpredication(self, target='A'): - global d - if (target == 'A'): - d['A'] += F' "ptrue {self.asmregwithsuffix} \\n\\t" \\\n' - d['asmclobber'].append(F'"{self.asmreg}"') - - def loadtable(self, t): - global d - d['load'] += d['factor'] - gpr = d['asmtableptr'] - - cast = 'uint64_t' - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - # asm_opcode = 'ld1w' - # cast = 'uint32_t' - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'uint32_t' - - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' - - # using immediate index break-out works - if asm_opcode == 'ldr': - # ldr version - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' - else: - # ld1 version - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' 
- - d['asminput'].append(F'[tableptr] "r" (&lut[0])') - d['asminput'].append(F'[index] "i" ({t})') - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - - def load(self, address, target='ALL', cast='float64_t', colors=3, offset=FETCH_BASE_PTR_COLOR_OFFSET): - global d - d['load'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'ld1d' - #if PRECISION == 'single': - #asm_opcode = 'ld1w' - # cast = 'float32_t' - - asm_opcode = 'ldr' - if PRECISION == 'single': - asm_opcode = 'ldr' - cast = 'float32_t' - - gpr = d['asmfetchbaseptr'] - intrinfetchbase = d['intrinfetchbase'] - if (target in ['ALL', 'C']): - d['C'] += F' {self.name} = {address}; \\\n' - if (target in ['ALL', 'I']): -# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' - if (target in ['ALL', 'A']): - if asm_opcode == 'ldr': - d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' - - def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): - global d - d['store'] += d['factor'] - indices = re.findall(r'\d+', address) - index = (int(indices[0]) - offset) * colors + int(indices[1]) - - #asm_opcode = 'stnt1d' - #if PRECISION == 'single': - # asm_opcode = 'stnt1w' - # cast = 'float32_t' - asm_opcode = 'str' - if PRECISION == 'single': - asm_opcode = 'str' - cast = 'float32_t' - - intrinstorebase = d['intrinstorebase'] - - d['C'] += F' {address} = {self.name}; \\\n' - #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - if asm_opcode == 'str': - d['A'] 
+= F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - else: - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' - - def movestr(self, str): - global d - #d['move'] += d['factor'] - d['I'] += F' {self.name} = {str}; \\\n' - - def move(self, op1): - global d - d['move'] += d['factor'] - d['C'] += F' {self.name} = {op1.name}; \\\n' - d['I'] += F' {self.name} = {op1.name}; \\\n' - d['A'] += F' "mov {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - - # a = a + b , a = b + c - def add(self, op1, op2=None): - global d - d['add'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} + {op1.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + {op2.name}; \\\n' - d['I'] += F' {self.name} = svadd_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a -b , a = b - c - def sub(self, op1, op2=None): - global d - d['sub'] += d['factor'] - if op2 is None: - d['C'] += F' {self.name} = {self.name} - {op1.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {self.name}, {op1.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - {op2.name}; \\\n' - d['I'] += F' {self.name} = svsub_x(pg1, {op1.name}, {op2.name}); \\\n' - d['A'] += F' "fsub {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix} \\n\\t" \\\n' - - # a = a * b , a = b * c - def mul(self, op1, op2): - global d - d['mul'] += 2 * d['factor'] - d['C'] += F' {self.name} = {op1.name} * {op2.name}; 
\\\n' - d['I'] += F' {self.name} = __svzero({self.name}); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "mov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mul0(self, op1, op2, op3=None, constructive=False): - global d - d['mul'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {op1.name}, {op2.name}, {op3.name}, 0); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op2.asmregwithsuffix}, {op3.asmregwithsuffix}, 0 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mul1(self, op1, op2): - global d - d['mul'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac(self, op1, op2): - global d - d['mac'] += 2 * d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla 
{self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def mac0(self, op1, op2): - global d - d['mac'] += d['factor'] - d['C'] += F' {self.name} = {self.name} + {op1.name} * {op2.name}; \\\n' - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 0); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 0 \\n\\t" \\\n' - - def mac1(self, op1, op2): - global d - d['mac'] += d['factor'] - d['I'] += F' {self.name} = svcmla_x(pg1, {self.name}, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcmla {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def zero(self, zeroreg=False): - d['zero'] += d['factor'] - d['C'] += F' {self.name} = 0; \\\n' - #d['I'] += F' {self.name} = __svzero({self.name}); \\\n' only armclang - - if PRECISION == 'double': - d['I'] += F' {self.name} = svdup_f64(0.); \\\n' - else: - d['I'] += F' {self.name} = svdup_f32(0.); \\\n' - - if zeroreg == True: - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - else: - #using mov z, zero0 issue 1c, FLA, latency 6c - #d['A'] += F' "mov {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using mov z, 0 issue 1c, FLA, latency 6c - d['A'] += F' "fmov {self.asmregwithsuffix} , 0 \\n\\t" \\\n' - - #using xor z, z, z issue 0.5c, FL*, latency 4c - #d['A'] += F' "eor {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {self.asmregwithsuffix} \\n\\t" \\\n' - - #using and z, z, zero0 issue 0.5c, FL*, latency 4c - #d['A'] += F' "and {self.asmregwithsuffix}, {self.asmregwithsuffix} , {zero0.asmregwithsuffix} \\n\\t" \\\n' - - #using sub z, z, z issue 0.5c, FL*, latency 9c - #d['A'] += F' "sub {self.asmregwithsuffix}, {self.asmregwithsuffix}, 
{self.asmregwithsuffix} \\n\\t" \\\n' - - # without table - def timesI(self, op1, tempreg=None, tablereg=None): - global d - d['timesI'] += d['factor'] - d['C'] += F' {self.name} = timesI({op1.name}); \\\n' - # correct if DEBUG enabled, wrong if DEBUG disabled; no idea what's causing this - #table.load('table2', target='I', cast='uint64_t') - #d['I'] += F' {self.name} = svtbl({op1.name}, {tablereg.name}); \\\n' - #d['I'] += F' {self.name} = svneg_x(pg2, {self.name}); \\\n' - # timesI using trn tested, works but tbl should be faster - d['I'] += F' {tempreg.name} = svtrn2({op1.name}, {op1.name}); \\\n' - d['I'] += F' {tempreg.name} = svneg_x(pg1, {tempreg.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({tempreg.name}, {op1.name}); \\\n' - d['A'] += F' "trn2 {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fneg {tempreg.asmregwithsuffix}, {pg1.asmreg}/m, {tempreg.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "trn1 {self.asmregwithsuffix}, {tempreg.asmregwithsuffix}, {op1.asmregwithsuffix} \\n\\t" \\\n' - - def addTimesI(self, op1, op2=None, constructive=False): - global d - d['addTimesI'] += d['factor'] - - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} + timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, 
{self.asmregwithsuffix}, {op1.asmregwithsuffix}, 90 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} + timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 90); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 90 \\n\\t" \\\n' - - def subTimesI(self, op1, op2=None, constructive=False): - global d - d['subTimesI'] += d['factor'] - - # no movprfx intrinsics support - if constructive == True: - d['movprfx'] += d['factor'] - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "movprfx {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix} \\n\\t" \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - if op2 is None: - d['C'] += F' {self.name} = {self.name} - timesI({op1.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {self.name}, {op1.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {self.asmregwithsuffix}, {op1.asmregwithsuffix}, 270 \\n\\t" \\\n' - else: - d['C'] += F' {self.name} = {op1.name} - timesI({op2.name}); \\\n' - d['I'] += F' {self.name} = svcadd_x(pg1, {op1.name}, {op2.name}, 270); \\\n' - d['A'] += F' "fcadd {self.asmregwithsuffix}, {pg1.asmreg}/m, {op1.asmregwithsuffix}, {op2.asmregwithsuffix}, 270 \\n\\t" \\\n' - - # timesMinusI is not used, def is probably wrong !!!! 
OPTIMIZATION with table - def timesMinusI(self, op1): - global d - d['timesMinusI'] += d['factor'] - d['C'] += F' {self.name} = timesMinusI({self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {op1.name}); \\\n' - d['I'] += F' {self.name} = svneg_x(pg1, {self.name}); \\\n' - d['I'] += F' {self.name} = svtrn1({op1.name}, {self.name}); \\\n' - - def permute(self, dir, tablereg=None): - global d - d['permutes'] += d['factor'] - - d['C'] += F' permute{dir}({self.name}, {self.name}); \\\n' - - d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - # if dir == 0: - # d['I'] += F' {self.name} = svext({self.name}, {self.name}, 4); \\\n' - # # this might not work, see intrinsics assembly - # # d['A'] += F' ext {self.name}, {self.name}, {self.name}, #4 \\\n' - # # use registers directly - # d['A'] += F' "ext {self.asmregbyte}, {self.asmregbyte}, {self.asmregbyte}, 32 \\n\\t" \\\n' - # - # elif dir in [1, 2]: - # d['I'] += F' {self.name} = svtbl({self.name}, {tablereg.name}); \\\n' - # d['A'] += F' "tbl {self.asmregwithsuffix}, {{ {self.asmregwithsuffix} }}, {tablereg.asmregwithsuffix} \\n\\t" \\\n' - - def debug(self): - global d - typecast = d['cfloat'] - gpr = d['asmdebugptr'] - vregs = d['asmclobberlist'] - if (d['debug'] == True): - d['C'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['I'] += F'svst1(pg1, ({typecast}*)&debugreg.v, {self.name}); \\\n' - d['I'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - #d['I'] += F'std::cout << "{self.name} -- " << {self.name} << std::endl; \\\n' - - d['A'] += F'asm ( \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' "str {self.asmreg}, [%[ptr]] \\n\\t" \\\n' - d['A'] += F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\\n' # memory barrier - d['A'] += F' : "=m" 
(debugreg.v) \\\n' - d['A'] += F' : [ptr] "r" (&debugreg.v) \\\n' - d['A'] += F' : "p5", "cc", "memory" \\\n' - d['A'] += F'); \\\n' - d['A'] += F'std::cout << "{self.name} -- " << debugreg << std::endl; \\\n' - # this form of addressing is not valid! - #d['A'] += F' "str {self.asmreg}, %[ptr] \\n\\t" \\\n' -# end Register - -def define(s, target='ALL'): - x = F'#define {s} \n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def definemultiline(s): - x = F'#define {s} \\\n' - global d - d['C'] += x - d['I'] += x - d['A'] += x - -def write(s, target='ALL'): - x = F'{s}\n' - global d - if (target in ['ALL', 'C']): - d['C'] += x - if (target in ['ALL', 'I']): - d['I'] += x - if (target in ['ALL', 'A']): - d['A'] += x - -def curlyopen(): - write(F'{{ \\') - -def curlyclose(): - write(F'}}') - -def newline(target='ALL'): - global d - - if target == 'A': - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - else: - if d['C'][-2:] == '\\\n': - d['C'] = d['C'][:-2] + '\n\n' - if d['I'][-2:] == '\\\n': - d['I'] = d['I'][:-2] + '\n\n' - if d['A'][-2:] == '\\\n': - d['A'] = d['A'][:-2] + '\n\n' - -# load the base pointer for fetches -def fetch_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - - # DEBUG - #colors=3 - #indices = re.findall(r'\d+', address) - #index = (int(indices[0]) - FETCH_BASE_PTR_COLOR_OFFSET) * colors + int(indices[1]) - #print(F'{address} (base)') - - vregs = d['asmclobberlist'] - if target == 'A': - d['asminput'].append(F'[fetchptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - #print("intrinfetchbase = ", address) - d['intrinfetchbase'] = address - -# load the base pointer for stores -def store_base_ptr(address, target='A'): - global d - #d['load'] += d['factor'] - gpr = d['asmstorebaseptr'] - vregs = d['asmclobberlist'] - if 
target == 'A': - d['asminput'].append(F'[storeptr] "r" ({address})') - d['asmclobber'].extend(vregs) - d['asmclobber'].append(F'"memory"') - d['asmclobber'].append(F'"cc"') - if target == 'I': - d['intrinstorebase'] = address - -def prefetch_L1(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL1STRM" # weak - #policy = "PLDL1KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L2(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PLDL2STRM" # weak - #policy = "PLDL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - #d['A'] += - -def prefetch_L2_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL2STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - -def prefetch_L1_store(address, offset): - global d - multiplier = 4 # offset in CL, have to multiply by 4 - policy = "PSTL1STRM" # weak - #policy = "PSTL2KEEP" # strong - - d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' - d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - - -def asmopen(): - #write('asm volatile ( \\', target='A') - write('asm ( \\', target='A') - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - #write('asm volatile ( \\', target='A') - -def 
asmclose(): - global d - - #print(d['asminput']) - - asmin = d['asminput'] - asmin_s = '' - if len(asmin) > 0: - asmin = list(dict.fromkeys(asmin)) # remove duplicates - #print(asmin) - for el in asmin: - asmin_s += el + ',' - asmin_s = asmin_s[:-1] - #print("-> ", asmin_s) - - d['asminput'] = [] - - asmout = d['asmoutput'] - asmout_s = '' - if len(asmout) > 0: - asmout = list(dict.fromkeys(asmout)) # remove duplicates - for el in asmout: - asmout_s += el + ',' - asmout_s = asmout_s[:-1] - - d['asmoutput'] = [] - - # DEBUG put all regs into clobber by default - d['asmclobber'].extend(d['asmclobberlist']) - - asmclobber = d['asmclobber'] - asmclobber_s = '' - #print(asmclobber) - if len(asmclobber) > 0: - asmclobber = list(dict.fromkeys(asmclobber)) # remove duplicates - for el in asmclobber: - asmclobber_s += el + ',' - asmclobber_s = asmclobber_s[:-1] - - d['asmclobber'] = [] - - # DEBUG - #write(F' " DMB SY \\n\\t " " DSB SY \\n\\t " " ISB SY \\n\\t " \\', target='A') # memory barrier - - - write(F' : {asmout_s} \\', target='A') - write(F' : {asmin_s} \\', target='A') - write(F' : {asmclobber_s} \\', target='A') - write('); \\', target='A') - -# -------------------------------------------------------------------------------- - -# string of vector registers to be used in clobber list -#clobberlist = ['"p0"'] -clobberlist = ['"p5"'] -clobberlist.append('"cc"') -for i in range(0, 32): - clobberlist.append(F'"z{i}"') - -d = { -'debug': _DEBUG, -'C': '', -'I': '', -'A': '', -'asmsuffix': '.d', # double precision by default -'cfloat': 'float64_t', -'registers': 0, -'load': 0, -'store': 0, -'move': 0, -'movprfx': 0, -'zero': 0, -'add': 0, -'sub': 0, -'mul': 0, -'mac': 0, -'permutes': 0, -'neg': 0, -'addTimesI': 0, -'subTimesI': 0, -'timesI': 0, -'timesMinusI': 0, -'flops': 0, -'factor': 1, # multiplicity -'asmtableptr': 'x30', -'asmfetchbaseptr': 'x29', -'asmstorebaseptr': 'x28', -'asmdebugptr': 'r12', -'asminput': [], -'asmoutput': [], -'asmclobber': [], 
-'asmclobberlist': clobberlist, -'intrinfetchbase': '', -'intrinstorebase': '', -'cycles_LOAD_CHIMU': 0, -'cycles_PROJ': 0, -'cycles_PERM': 0, -'cycles_MULT_2SPIN': 0, -'cycles_RECON': 0, -'cycles_RESULT': 0, -'cycles_ZERO_PSI': 0, -'cycles_PREFETCH_L1': 0, -'cycles_PREFETCH_L2': 0 -} - -if PRECISION == 'single': - d['asmsuffix'] = '.s' - d['cfloat'] = 'float32_t' - -# -------------------------------------------------------------------------------- -# Grid -# -------------------------------------------------------------------------------- - -# Variables / Registers -result_00 = Register('result_00', asmreg='z0') -result_01 = Register('result_01', asmreg='z1') -result_02 = Register('result_02', asmreg='z2') -result_10 = Register('result_10', asmreg='z3') -result_11 = Register('result_11', asmreg='z4') -result_12 = Register('result_12', asmreg='z5') -result_20 = Register('result_20', asmreg='z6') -result_21 = Register('result_21', asmreg='z7') -result_22 = Register('result_22', asmreg='z8') -result_30 = Register('result_30', asmreg='z9') -result_31 = Register('result_31', asmreg='z10') -result_32 = Register('result_32', asmreg='z11') # 12 Regs -Chi_00 = Register('Chi_00', asmreg='z12') -Chi_01 = Register('Chi_01', asmreg='z13') -Chi_02 = Register('Chi_02', asmreg='z14') -Chi_10 = Register('Chi_10', asmreg='z15') -Chi_11 = Register('Chi_11', asmreg='z16') -Chi_12 = Register('Chi_12', asmreg='z17') # 6 -UChi_00 = Register('UChi_00', asmreg='z18') -UChi_01 = Register('UChi_01', asmreg='z19') -UChi_02 = Register('UChi_02', asmreg='z20') -UChi_10 = Register('UChi_10', asmreg='z21') -UChi_11 = Register('UChi_11', asmreg='z22') -UChi_12 = Register('UChi_12', asmreg='z23') # 6 -U_00 = Register('U_00', asmreg='z24') -U_10 = Register('U_10', asmreg='z25') -U_20 = Register('U_20', asmreg='z26') -U_01 = Register('U_01', asmreg='z27') -U_11 = Register('U_11', asmreg='z28') -U_21 = Register('U_21', asmreg='z29') # 6 -> 30 Registers - -table0 = Register('table0', asmreg='z30') 
-zero0 = Register('zero0', asmreg='z31') # 2 -> 32 Registers -# can't overload temp1 / table due to type mismatch using intrinsics :( -# typecasting SVE intrinsics variables is not allowed - -pg1 = Register('pg1', predication=True, asmreg='p5') -#pg2 = Register('pg2', predication=True, asmreg='p1') - -# Overloaded with Chi_* and UChi_* -Chimu_00 = Register('Chimu_00', asmreg=Chi_00.asmreg) -Chimu_01 = Register('Chimu_01', asmreg=Chi_01.asmreg) -Chimu_02 = Register('Chimu_02', asmreg=Chi_02.asmreg) -Chimu_10 = Register('Chimu_10', asmreg=Chi_10.asmreg) -Chimu_11 = Register('Chimu_11', asmreg=Chi_11.asmreg) -Chimu_12 = Register('Chimu_12', asmreg=Chi_12.asmreg) -if ALTERNATIVE_REGISTER_MAPPING == False: - Chimu_20 = Register('Chimu_20', asmreg=UChi_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=UChi_01.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=UChi_02.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=UChi_10.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=UChi_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=UChi_12.asmreg) # 12 Registers -else: # wilson4.h - Chimu_20 = Register('Chimu_20', asmreg=U_00.asmreg) - Chimu_21 = Register('Chimu_21', asmreg=U_10.asmreg) - Chimu_22 = Register('Chimu_22', asmreg=U_20.asmreg) - Chimu_30 = Register('Chimu_30', asmreg=U_01.asmreg) - Chimu_31 = Register('Chimu_31', asmreg=U_11.asmreg) - Chimu_32 = Register('Chimu_32', asmreg=U_21.asmreg) - -# debugging output -def debugall(msg=None, group='ALL'): - global d - if (d['debug'] == False): - return - write(F'std::cout << std::endl << "DEBUG -- {msg}" << std::endl; \\') - if (group in ['ALL', 'result']): - result_00.debug() - result_01.debug() - result_02.debug() - result_10.debug() - result_11.debug() - result_12.debug() - result_20.debug() - result_21.debug() - result_22.debug() - result_30.debug() - result_31.debug() - result_32.debug() - if (group in ['ALL', 'Chi']): - Chi_00.debug() - Chi_01.debug() - Chi_02.debug() - Chi_10.debug() - Chi_11.debug() - Chi_12.debug() 
- if (group in ['ALL', 'UChi']): - UChi_00.debug() - UChi_01.debug() - UChi_02.debug() - UChi_10.debug() - UChi_11.debug() - UChi_12.debug() - if (group in ['ALL', 'U']): - U_00.debug() - U_10.debug() - U_20.debug() - U_01.debug() - U_11.debug() - U_21.debug() - if (group in ['ALL', 'Chimu']): - Chimu_00.debug() - Chimu_01.debug() - Chimu_02.debug() - Chimu_10.debug() - Chimu_11.debug() - Chimu_12.debug() - Chimu_20.debug() - Chimu_21.debug() - Chimu_22.debug() - Chimu_30.debug() - Chimu_31.debug() - Chimu_32.debug() - -# -------------------------------------------------------------------------------- -# Output -# -------------------------------------------------------------------------------- - -if ALTERNATIVE_LOADS == True: - define(F'LOAD_CHIMU_0213_PLUG LOAD_CHIMU_0213_{PRECSUFFIX}') - define(F'LOAD_CHIMU_0312_PLUG LOAD_CHIMU_0312_{PRECSUFFIX}') - define(F'LOAD_CHIMU(x)') -else: - #define(F'LOAD_CHIMU_{PRECSUFFIX}(x) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(x)') - define(F'LOAD_CHIMU(base) LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') - -if PREFETCH: - define(F'PREFETCH_CHIMU_L1(A) PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L1(A) PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_CHIMU_L2(A) PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') - define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') -# define(F'PREFETCH_CHIMU(A)') -else: - define(F'PREFETCH_CHIMU_L1(A)') - define(F'PREFETCH_GAUGE_L1(A)') - define(F'PREFETCH_CHIMU_L2(A)') - define(F'PREFETCH_GAUGE_L2(A)') - define(F'PF_GAUGE(A)') - define(F'PREFETCH1_CHIMU(A)') - define(F'PREFETCH_CHIMU(A)') - 
define(F'PREFETCH_RESULT_L2_STORE(A)') - -# standard defines -define(F'LOCK_GAUGE(A)') -define(F'UNLOCK_GAUGE(A)') -define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B)') -define(F'MULT_2SPIN_1(Dir) MULT_2SPIN_1_{PRECSUFFIX}(Dir)') -define(F'MULT_2SPIN_2 MULT_2SPIN_2_{PRECSUFFIX}') -define(F'LOAD_CHI(base) LOAD_CHI_{PRECSUFFIX}(base)') -# don't need zero psi, everything is done in recons -#define(F'ZERO_PSI ZERO_PSI_{PRECSUFFIX}') -define(F'ADD_RESULT(base,basep) LOAD_CHIMU(base); ADD_RESULT_INTERNAL_{PRECSUFFIX}; RESULT_{PRECSUFFIX}(base)') -# loads projections -define(F'XP_PROJ XP_PROJ_{PRECSUFFIX}') -define(F'YP_PROJ YP_PROJ_{PRECSUFFIX}') -define(F'ZP_PROJ ZP_PROJ_{PRECSUFFIX}') -define(F'TP_PROJ TP_PROJ_{PRECSUFFIX}') -define(F'XM_PROJ XM_PROJ_{PRECSUFFIX}') -define(F'YM_PROJ YM_PROJ_{PRECSUFFIX}') -define(F'ZM_PROJ ZM_PROJ_{PRECSUFFIX}') -define(F'TM_PROJ TM_PROJ_{PRECSUFFIX}') -# recons -define(F'XP_RECON XP_RECON_{PRECSUFFIX}') -define(F'XM_RECON XM_RECON_{PRECSUFFIX}') -define(F'XM_RECON_ACCUM XM_RECON_ACCUM_{PRECSUFFIX}') -define(F'YM_RECON_ACCUM YM_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZM_RECON_ACCUM ZM_RECON_ACCUM_{PRECSUFFIX}') -define(F'TM_RECON_ACCUM TM_RECON_ACCUM_{PRECSUFFIX}') -define(F'XP_RECON_ACCUM XP_RECON_ACCUM_{PRECSUFFIX}') -define(F'YP_RECON_ACCUM YP_RECON_ACCUM_{PRECSUFFIX}') -define(F'ZP_RECON_ACCUM ZP_RECON_ACCUM_{PRECSUFFIX}') -define(F'TP_RECON_ACCUM TP_RECON_ACCUM_{PRECSUFFIX}') -# new permutes -define(F'PERMUTE_DIR0 0') -define(F'PERMUTE_DIR1 1') -define(F'PERMUTE_DIR2 2') -define(F'PERMUTE_DIR3 3') -define(F'PERMUTE PERMUTE_{PRECSUFFIX};') -# load table -#define(F'MAYBEPERM(A,perm) if (perm) {{ A ; }}') -if PRECISION == 'double': - define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1; }} else if (Dir == 2) {{ LOAD_TABLE2; }}') - define(F'MAYBEPERM(Dir,perm) if (Dir != 3) {{ if (perm) {{ PERMUTE; }} }}') -else: - 
define(F'LOAD_TABLE(Dir) if (Dir == 0) {{ LOAD_TABLE0; }} else if (Dir == 1) {{ LOAD_TABLE1 }} else if (Dir == 2) {{ LOAD_TABLE2; }} else if (Dir == 3) {{ LOAD_TABLE3; }}') - define(F'MAYBEPERM(A,perm) if (perm) {{ PERMUTE; }}') - - - -write('// DECLARATIONS') -definemultiline(F'DECLARATIONS_{PRECSUFFIX}') -# debugging register -if d['debug'] == True: - write(' Simd debugreg; \\') -# perm tables -if PRECISION == 'double': - write(' const uint64_t lut[4][8] = { \\') - write(' {4, 5, 6, 7, 0, 1, 2, 3}, \\') #0 = swap register halves - write(' {2, 3, 0, 1, 6, 7, 4, 5}, \\') #1 = swap halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6}, \\') #2 = swap re/im - write(' {0, 1, 2, 4, 5, 6, 7, 8} };\\') #3 = identity -else: - write(' const uint32_t lut[4][16] = { \\') - write(' {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, \\') #0 = swap register halves - write(' {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}, \\') #1 = swap halves of halves - write(' {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}, \\') #2 = swap halves of halves of halves - write(' {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14} }; \\') #3 = swap re/im - -#newline(target='A') -result_00.declare() -result_01.declare() -result_02.declare() -result_10.declare() -result_11.declare() -result_12.declare() -result_20.declare() -result_21.declare() -result_22.declare() -result_30.declare() -result_31.declare() -result_32.declare() # 12 -Chi_00.declare() -Chi_01.declare() -Chi_02.declare() -Chi_10.declare() -Chi_11.declare() -Chi_12.declare() # 6 -UChi_00.declare() -UChi_01.declare() -UChi_02.declare() -UChi_10.declare() -UChi_11.declare() -UChi_12.declare() # 6 -U_00.declare() -U_10.declare() -U_20.declare() -U_01.declare() -U_11.declare() -U_21.declare() # 6 -> 30 regs - -# all predications true -pg1.declare() -if PRECISION == 'double': - pg1.movestr('svptrue_b64()') -else: - pg1.movestr('svptrue_b32()') - -# tables -if PRECISION == 'double': - write(' svuint64_t table0; \\', 
target='I') # -> 31 regs -else: - write(' svuint32_t table0; \\', target='I') # -> 31 regs - -zero0.declare() - -# zero register -asmopen() -zero0.zero(zeroreg=True) -asmclose() -newline() - -define('Chimu_00 Chi_00', target='I') -define('Chimu_01 Chi_01', target='I') -define('Chimu_02 Chi_02', target='I') -define('Chimu_10 Chi_10', target='I') -define('Chimu_11 Chi_11', target='I') -define('Chimu_12 Chi_12', target='I') -if ALTERNATIVE_REGISTER_MAPPING == False: - define('Chimu_20 UChi_00', target='I') - define('Chimu_21 UChi_01', target='I') - define('Chimu_22 UChi_02', target='I') - define('Chimu_30 UChi_10', target='I') - define('Chimu_31 UChi_11', target='I') - define('Chimu_32 UChi_12', target='I') -else: # wilson4.h - define('Chimu_20 U_00', target='I') - define('Chimu_21 U_10', target='I') - define('Chimu_22 U_20', target='I') - define('Chimu_30 U_01', target='I') - define('Chimu_31 U_11', target='I') - define('Chimu_32 U_21', target='I') -newline() - - -d['cycles_RESULT'] += 12 -write('// RESULT') -definemultiline(F'RESULT_{PRECSUFFIX}(base)') -if ASM_STORE: - curlyopen() - #write(' SiteSpinor & ref(out[ss]); \\') - asmopen() - #pg1.loadpredication() - #store_base_ptr("&ref[0][0]") - #store_base_ptr(F"&ref[{STORE_BASE_PTR_COLOR_OFFSET}][0]") - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - store_base_ptr(F"base + {STORE_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - result_00.store("ref[0][0]") - result_01.store("ref[0][1]") - result_02.store("ref[0][2]") - result_10.store("ref[1][0]") - result_11.store("ref[1][1]") - result_12.store("ref[1][2]") - result_20.store("ref[2][0]") - result_21.store("ref[2][1]") - result_22.store("ref[2][2]") - result_30.store("ref[3][0]") - result_31.store("ref[3][1]") - result_32.store("ref[3][2]") - asmclose() - debugall('RESULT', group='result') - curlyclose() -newline() - -# prefetch spinors from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// 
PREFETCH_CHIMU_L2 (prefetch to L2)') -definemultiline(F'PREFETCH_CHIMU_L2_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"base", target='A') -prefetch_L2(F"base", 0) -prefetch_L2(F"base", 1) -prefetch_L2(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch spinors from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_CHIMU_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_CHIMU_L1_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -#pg1.loadpredication() -fetch_base_ptr(F"base", target='A') -prefetch_L1(F"base", 0) -prefetch_L1(F"base", 1) -prefetch_L1(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L2 (prefetch to L2)') -definemultiline(F'PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sUn][A]); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -else: - write(' const auto & ref(U[sUn](A)); uint64_t baseU = (uint64_t)&ref + 3 * 3 * 64; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L2(F"baseU", -1) -prefetch_L2(F"baseU", 0) -prefetch_L2(F"baseU", 1) -prefetch_L2(F"baseU", 2) -prefetch_L2(F"baseU", 3) -prefetch_L2(F"baseU", 4) -prefetch_L2(F"baseU", 5) -prefetch_L2(F"baseU", 6) -prefetch_L2(F"baseU", 7) -#prefetch_L2(F"baseU", 8) -asmclose() -curlyclose() -newline() - -# prefetch gauge from memory into L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_GAUGE_L1 (prefetch to L1)') -definemultiline(F'PREFETCH_GAUGE_L1_INTERNAL_{PRECSUFFIX}(A)') -curlyopen() -if GRIDBENCH: # 
referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") -fetch_base_ptr(F"baseU", target='A') -prefetch_L1(F"baseU", 0) -prefetch_L1(F"baseU", 1) -prefetch_L1(F"baseU", 2) -asmclose() -curlyclose() -newline() - -d['factor'] = 0 -write('// LOAD_CHI') -definemultiline(F'LOAD_CHI_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - #fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - fetch_base_ptr(F"base", target='I') - fetch_base_ptr(F"base", target='A') - - Chi_00.load("ref[0][0]", offset=0) - Chi_01.load("ref[0][1]", offset=0) - Chi_02.load("ref[0][2]", offset=0) - Chi_10.load("ref[1][0]", offset=0) - Chi_11.load("ref[1][1]", offset=0) - Chi_12.load("ref[1][2]", offset=0) - asmclose() - debugall('LOAD_CHI', group='Chi') - curlyclose() -newline() - - - -d['factor'] = 8 -# 12 loads = 12 issues, load latency = 8+1 cycles -# (not perfectly clear to me from docs) -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU') -definemultiline(F'LOAD_CHIMU_INTERLEAVED_{PRECSUFFIX}(base)') -if ASM_LOAD_CHIMU: - curlyopen() - #write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - #fetch_base_ptr("&ref[0][0]") - #fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') - fetch_base_ptr(F"base + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - # Chimu_00.load("ref[0][0]") - # Chimu_01.load("ref[0][1]") - # Chimu_02.load("ref[0][2]") - # Chimu_10.load("ref[1][0]") - # Chimu_11.load("ref[1][1]") - # Chimu_12.load("ref[1][2]") - # 
Chimu_20.load("ref[2][0]") - # Chimu_21.load("ref[2][1]") - # Chimu_22.load("ref[2][2]") - # Chimu_30.load("ref[3][0]") - # Chimu_31.load("ref[3][1]") - # Chimu_32.load("ref[3][2]") - - Chimu_00.load("ref[0][0]") # minimum penalty for all directions - Chimu_30.load("ref[3][0]") - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0213 -# placed into asm (...) -d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0213') -definemultiline(F'LOAD_CHIMU_0213_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_20.load("ref[2][0]") - - Chimu_01.load("ref[0][1]") - Chimu_21.load("ref[2][1]") - - Chimu_02.load("ref[0][2]") - Chimu_22.load("ref[2][2]") - - Chimu_10.load("ref[1][0]") - Chimu_30.load("ref[3][0]") - - Chimu_11.load("ref[1][1]") - Chimu_31.load("ref[3][1]") - - Chimu_12.load("ref[1][2]") - Chimu_32.load("ref[3][2]") - asmclose() - debugall('LOAD_CHIMU_0213', group='Chimu') - curlyclose() -newline() - -# alternative load chimu: dirac order 0312 -# placed into asm (...) 
-d['factor'] = 0 -d['cycles_LOAD_CHIMU'] += 11 * d['factor'] -write('// LOAD_CHIMU_0312') -definemultiline(F'LOAD_CHIMU_0312_{PRECSUFFIX}') -if ASM_LOAD_CHIMU: - curlyopen() - write(' const SiteSpinor & ref(in[offset]); \\') - asmopen() - pg1.loadpredication() - fetch_base_ptr(F"&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]") - Chimu_00.load("ref[0][0]") # reordered - Chimu_30.load("ref[3][0]") - - Chimu_01.load("ref[0][1]") - Chimu_31.load("ref[3][1]") - - Chimu_02.load("ref[0][2]") - Chimu_32.load("ref[3][2]") - - Chimu_10.load("ref[1][0]") - Chimu_20.load("ref[2][0]") - - Chimu_11.load("ref[1][1]") - Chimu_21.load("ref[2][1]") - - Chimu_12.load("ref[1][2]") - Chimu_22.load("ref[2][2]") - asmclose() - debugall('LOAD_CHIMU_0312', group='Chimu') - curlyclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE0') -definemultiline(F'LOAD_TABLE0') -asmopen() -table0.loadtable(0) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE1') -definemultiline(F'LOAD_TABLE1') -asmopen() -table0.loadtable(1) -asmclose() -newline() - -d['factor'] = 2 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE2') -definemultiline(F'LOAD_TABLE2') -asmopen() -table0.loadtable(2) -asmclose() -newline() - -d['factor'] = 0 -d['cycles_PERM'] += 1 * d['factor'] -write('// LOAD_TABLE3') -definemultiline(F'LOAD_TABLE3') -asmopen() -table0.loadtable(3) -asmclose() -newline() - -d['factor'] = 2 # factor is 2 -d['cycles_PERM'] += 6 * d['factor'] -write('// PERMUTE') -definemultiline(F'PERMUTE_{PRECSUFFIX}') -debugall('PERM PRE', group='Chi') -asmopen() -#table0.loadtable(2) -Chi_00.permute(2, table0) -Chi_01.permute(2, table0) -Chi_02.permute(2, table0) -Chi_10.permute(2, table0) -Chi_11.permute(2, table0) -Chi_12.permute(2, table0) -asmclose() -debugall('PERM POST', group='Chi') -newline() - -write('// LOAD_GAUGE') -definemultiline(F'LOAD_GAUGE') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' 
const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -curlyopen() -asmopen() -pg1.loadpredication() -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -if ASM_LOAD_GAUGE: - fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") -asmclose() -curlyclose() -newline() - -d['factor'] = 8 # MULT_2SPIN executes 1 time per direction = 8 times total -# assume all U loads are hidden -# FCMLA issue latency = 2 cycles -# measurement: latency = 16 cycles if FULLY pipelined !? -# spec says 6+6+9 cycles -# 6 rounds of FCMLA, each with 6 FCMLA -> 21 - 6*2 = 9 -d['cycles_MULT_2SPIN'] += 6 * 21 * d['factor'] -write('// MULT_2SPIN') -definemultiline(F'MULT_2SPIN_1_{PRECSUFFIX}(A)') -curlyopen() -#write(' const auto & ref(U[sU][A]); \\') -if GRIDBENCH: # referencing differs in Grid and GridBench - write(' const auto & ref(U[sU][A]); uint64_t baseU = (uint64_t)&ref; \\') -else: - write(' const auto & ref(U[sU](A)); uint64_t baseU = (uint64_t)&ref; \\') -asmopen() -#pg1.loadpredication() -#fetch_base_ptr("&ref[0][0]") -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='I') -fetch_base_ptr(F"baseU + {FETCH_BASE_PTR_COLOR_OFFSET} * 3 * 64", target='A') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='I') -#fetch_base_ptr(F"(uint64_t)&ref[{FETCH_BASE_PTR_COLOR_OFFSET}][0]", target='A') -#fetch_base_ptr(F"&ref[0][{FETCH_BASE_PTR_COLOR_OFFSET}]") -if ASM_LOAD_GAUGE: - U_00.load("ref[0][0]") - U_10.load("ref[1][0]") - U_20.load("ref[2][0]") - U_01.load("ref[0][1]") - U_11.load("ref[1][1]") - U_21.load("ref[2][1]") - -if MOVPRFX == False: - UChi_00.zero() # implementation specific - UChi_10.zero() - UChi_01.zero() - UChi_11.zero() - UChi_02.zero() - 
UChi_12.zero() - - # round 1 - UChi_00.mul0(U_00, Chi_00) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(U_00, Chi_10) - UChi_01.mul0(U_10, Chi_00) - UChi_11.mul0(U_10, Chi_10) - UChi_02.mul0(U_20, Chi_00) - UChi_12.mul0(U_20, Chi_10) -else: - # round 1 - UChi_00.mul0(zero0, U_00, Chi_00, constructive=True) # FCMLA latency is 6+6+9 cycles - UChi_10.mul0(zero0, U_00, Chi_10, constructive=True) - UChi_01.mul0(zero0, U_10, Chi_00, constructive=True) - UChi_11.mul0(zero0, U_10, Chi_10, constructive=True) - UChi_02.mul0(zero0, U_20, Chi_00, constructive=True) - UChi_12.mul0(zero0, U_20, Chi_10, constructive=True) - -# round 2 -UChi_00.mul1(U_00, Chi_00) -UChi_10.mul1(U_00, Chi_10) -UChi_01.mul1(U_10, Chi_00) -UChi_11.mul1(U_10, Chi_10) -UChi_02.mul1(U_20, Chi_00) -UChi_12.mul1(U_20, Chi_10) # Chi_00 and Chi_10 available from here - -if ASM_LOAD_GAUGE: - U_00.load("ref[0][2]") # U_00, U_10, U_20 overloaded - U_10.load("ref[1][2]") # early load - U_20.load("ref[2][2]") # A --> -asmclose() -debugall('MULT_2SPIN_1', group='UChi') -curlyclose() -newline() - -write('// MULT_2SPIN_BACKEND') -definemultiline(F'MULT_2SPIN_2_{PRECSUFFIX}') -curlyopen() -asmopen() -# round 3 -UChi_00.mac0(U_01, Chi_01) # armclang separates fcmla(..., 0) and -UChi_10.mac0(U_01, Chi_11) # fcmla(..., 90) -UChi_01.mac0(U_11, Chi_01) # autonomously using intrinsics -UChi_11.mac0(U_11, Chi_11) -UChi_02.mac0(U_21, Chi_01) -UChi_12.mac0(U_21, Chi_11) -# round 4 -UChi_00.mac1(U_01, Chi_01) -UChi_10.mac1(U_01, Chi_11) -UChi_01.mac1(U_11, Chi_01) -UChi_11.mac1(U_11, Chi_11) -UChi_02.mac1(U_21, Chi_01) -UChi_12.mac1(U_21, Chi_11) -# round 5 -UChi_00.mac0(U_00, Chi_02) # <-- A -UChi_10.mac0(U_00, Chi_12) -UChi_01.mac0(U_10, Chi_02) -UChi_11.mac0(U_10, Chi_12) -UChi_02.mac0(U_20, Chi_02) -UChi_12.mac0(U_20, Chi_12) -# round 6 -UChi_00.mac1(U_00, Chi_02) -UChi_10.mac1(U_00, Chi_12) -UChi_01.mac1(U_10, Chi_02) -UChi_11.mac1(U_10, Chi_12) -UChi_02.mac1(U_20, Chi_02) -UChi_12.mac1(U_20, Chi_12) -asmclose() 
-debugall('MULT_2SPIN_2', group='UChi') -curlyclose() -newline() - - -#// hspin(0)=fspin(0)+timesI(fspin(3)); -#// hspin(1)=fspin(1)+timesI(fspin(2)); -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XP_PROJ') -definemultiline(F'XP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_30) -Chi_01.addTimesI(Chimu_01, Chimu_31) -Chi_02.addTimesI(Chimu_02, Chimu_32) -Chi_10.addTimesI(Chimu_10, Chimu_20) -Chi_11.addTimesI(Chimu_11, Chimu_21) -Chi_12.addTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XP_PROJ', group='Chi') -curlyclose() -newline() - -#// fspin(0)=hspin(0); -#// fspin(1)=hspin(1); -#// fspin(2)=timesMinusI(hspin(1)); -#// fspin(3)=timesMinusI(hspin(0)); -# does not occur in GridBench -d['factor'] = 0 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON') -definemultiline(F'XP_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.subTimesI(UChi_10) - result_21.subTimesI(UChi_11) - result_22.subTimesI(UChi_12) - result_30.subTimesI(UChi_00) - result_31.subTimesI(UChi_01) - result_32.subTimesI(UChi_02) -else: - result_20.subTimesI(zero0, UChi_10, constructive=True) - result_21.subTimesI(zero0, UChi_11, constructive=True) - result_22.subTimesI(zero0, UChi_12, constructive=True) - result_30.subTimesI(zero0, UChi_00, constructive=True) - result_31.subTimesI(zero0, UChi_01, constructive=True) - result_32.subTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) # don't reorder ! -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) - -# result_00.add(UChi_00) # faster than move? 
-# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -asmclose() -debugall('XP_RECON', group='result') -newline() - - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XP_RECON_ACCUM') -definemultiline(F'XP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_10) -# result_21.subTimesI(UChi_11) -# result_22.subTimesI(UChi_12) -# result_30.subTimesI(UChi_00) -# result_31.subTimesI(UChi_01) -# result_32.subTimesI(UChi_02) -# -# result_00.add(UChi_00) # reordered -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_31.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_32.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_20.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_21.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_22.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('XP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YP_PROJ') -definemultiline(F'YP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_30) -Chi_01.sub(Chimu_01, Chimu_31) -Chi_02.sub(Chimu_02, Chimu_32) -Chi_10.add(Chimu_10, Chimu_20) -Chi_11.add(Chimu_11, Chimu_21) -Chi_12.add(Chimu_12, Chimu_22) -asmclose() -debugall('YP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZP_PROJ') -definemultiline(F'ZP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() 
-#pg1.loadpredication() -Chi_00.addTimesI(Chimu_00, Chimu_20) -Chi_01.addTimesI(Chimu_01, Chimu_21) -Chi_02.addTimesI(Chimu_02, Chimu_22) -Chi_10.subTimesI(Chimu_10, Chimu_30) -Chi_11.subTimesI(Chimu_11, Chimu_31) -Chi_12.subTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZP_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TP_PROJ') -definemultiline(F'TP_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_20) -Chi_01.add(Chimu_01, Chimu_21) -Chi_02.add(Chimu_02, Chimu_22) -Chi_10.add(Chimu_10, Chimu_30) -Chi_11.add(Chimu_11, Chimu_31) -Chi_12.add(Chimu_12, Chimu_32) -asmclose() -debugall('TP_PROJ', group='Chi') -curlyclose() -newline() - -#// hspin(0)=fspin(0)-timesI(fspin(3)); -#// hspin(1)=fspin(1)-timesI(fspin(2)); - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// XM_PROJ') -definemultiline(F'XM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_30) -Chi_01.subTimesI(Chimu_01, Chimu_31) -Chi_02.subTimesI(Chimu_02, Chimu_32) -Chi_10.subTimesI(Chimu_10, Chimu_20) -Chi_11.subTimesI(Chimu_11, Chimu_21) -Chi_12.subTimesI(Chimu_12, Chimu_22) -asmclose() -debugall('XM_PROJ sub', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON') -definemultiline(F'XM_RECON_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() - -# only necessary if not zeroed before -if MOVPRFX == False: - result_20.zero() - result_21.zero() - result_22.zero() - result_30.zero() - result_31.zero() - result_32.zero() - - result_20.addTimesI(UChi_10) # <-- - result_21.addTimesI(UChi_11) - result_22.addTimesI(UChi_12) - result_30.addTimesI(UChi_00) - 
result_31.addTimesI(UChi_01) - result_32.addTimesI(UChi_02) -else: - result_20.addTimesI(zero0, UChi_10, constructive=True) # <-- - result_21.addTimesI(zero0, UChi_11, constructive=True) - result_22.addTimesI(zero0, UChi_12, constructive=True) - result_30.addTimesI(zero0, UChi_00, constructive=True) - result_31.addTimesI(zero0, UChi_01, constructive=True) - result_32.addTimesI(zero0, UChi_02, constructive=True) - -result_00.move(UChi_00) -result_01.move(UChi_01) -result_02.move(UChi_02) -result_10.move(UChi_10) -result_11.move(UChi_11) -result_12.move(UChi_12) -asmclose() -debugall('XM_RECON result', group='result') -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// YM_PROJ') -definemultiline(F'YM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0312_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.add(Chimu_00, Chimu_30) -Chi_01.add(Chimu_01, Chimu_31) -Chi_02.add(Chimu_02, Chimu_32) -Chi_10.sub(Chimu_10, Chimu_20) -Chi_11.sub(Chimu_11, Chimu_21) -Chi_12.sub(Chimu_12, Chimu_22) -asmclose() -debugall('YM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# FCADD issue latency = 1, latency is 6+9 -d['cycles_PROJ'] += 15 * d['factor'] -write('// ZM_PROJ') -definemultiline(F'ZM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() -asmopen() -#pg1.loadpredication() -Chi_00.subTimesI(Chimu_00, Chimu_20) -Chi_01.subTimesI(Chimu_01, Chimu_21) -Chi_02.subTimesI(Chimu_02, Chimu_22) -Chi_10.addTimesI(Chimu_10, Chimu_30) -Chi_11.addTimesI(Chimu_11, Chimu_31) -Chi_12.addTimesI(Chimu_12, Chimu_32) -asmclose() -debugall('ZM_PROJ', group='Chi') -curlyclose() -newline() - -d['factor'] = 1 -# add/sub issue latency = 1, latency is 9 -d['cycles_PROJ'] += 9 * d['factor'] -write('// TM_PROJ') -definemultiline(F'TM_PROJ_{PRECSUFFIX}') -if ALTERNATIVE_LOADS == True: - write(' LOAD_CHIMU_0213_PLUG \\') -curlyopen() 
-asmopen() -pg1.loadpredication() -Chi_00.sub(Chimu_00, Chimu_20) -Chi_01.sub(Chimu_01, Chimu_21) -Chi_02.sub(Chimu_02, Chimu_22) -Chi_10.sub(Chimu_10, Chimu_30) -Chi_11.sub(Chimu_11, Chimu_31) -Chi_12.sub(Chimu_12, Chimu_32) -asmclose() -debugall('TM_PROJ', group='Chi') -curlyclose() -newline() - -# does not occur in GridBench -d['factor'] = 0 -# add/sub issue latency = 1, latency is 9 -d['cycles_RECON'] += 15 * d['factor'] -write('// XM_RECON_ACCUM') -definemultiline(F'XM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -# result_20.addTimesI(UChi_10) -# result_21.addTimesI(UChi_11) -# result_22.addTimesI(UChi_12) -# result_30.addTimesI(UChi_00) -# result_31.addTimesI(UChi_01) -# result_32.addTimesI(UChi_02) -# -# # result_00.move(UChi_00) -# # result_01.move(UChi_01) -# # result_02.move(UChi_02) -# # result_10.move(UChi_10) -# # result_11.move(UChi_11) -# # result_12.move(UChi_12) -# -# # faster than move ? -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) - -result_30.addTimesI(UChi_00) # reordered -result_31.addTimesI(UChi_01) -result_32.addTimesI(UChi_02) - -result_20.addTimesI(UChi_10) -result_21.addTimesI(UChi_11) -result_22.addTimesI(UChi_12) - -result_00.add(UChi_00) -result_01.add(UChi_01) -result_02.add(UChi_02) -result_10.add(UChi_10) -result_11.add(UChi_11) -result_12.add(UChi_12) -asmclose() -debugall('XM_RECON_ACCUM', group='result') -newline() - - - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YP_RECON_ACCUM') -definemultiline(F'YP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_10) -# result_21.add(UChi_11) -# result_22.add(UChi_12) -# result_30.sub(UChi_00) -# result_31.sub(UChi_01) -# result_32.sub(UChi_02) - -result_00.add(UChi_00) # 
reordered -result_30.sub(UChi_00) - -result_01.add(UChi_01) -result_31.sub(UChi_01) - -result_02.add(UChi_02) -result_32.sub(UChi_02) - -result_10.add(UChi_10) -result_20.add(UChi_10) - -result_11.add(UChi_11) -result_21.add(UChi_11) - -result_12.add(UChi_12) -result_22.add(UChi_12) -asmclose() -debugall('YP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// YM_RECON_ACCUM') -definemultiline(F'YM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_10) -# result_21.sub(UChi_11) -# result_22.sub(UChi_12) -# result_30.add(UChi_00) -# result_31.add(UChi_01) -# result_32.add(UChi_02) - -result_00.add(UChi_00) # reordered -result_30.add(UChi_00) - -result_01.add(UChi_01) -result_31.add(UChi_01) - -result_02.add(UChi_02) -result_32.add(UChi_02) - -result_10.add(UChi_10) -result_20.sub(UChi_10) - -result_11.add(UChi_11) -result_21.sub(UChi_11) - -result_12.add(UChi_12) -result_22.sub(UChi_12) -asmclose() -debugall('YM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZP_RECON_ACCUM') -definemultiline(F'ZP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.subTimesI(UChi_00) -# result_21.subTimesI(UChi_01) -# result_22.subTimesI(UChi_02) -# result_30.addTimesI(UChi_10) -# result_31.addTimesI(UChi_11) -# result_32.addTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.subTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.subTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.subTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.addTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.addTimesI(UChi_11) 
-result_11.add(UChi_11) - -result_32.addTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 15 * d['factor'] -write('// ZM_RECON_ACCUM') -definemultiline(F'ZM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_20.addTimesI(UChi_00) -# result_21.addTimesI(UChi_01) -# result_22.addTimesI(UChi_02) -# result_30.subTimesI(UChi_10) -# result_31.subTimesI(UChi_11) -# result_32.subTimesI(UChi_12) -# -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -result_20.addTimesI(UChi_00) # reordered -result_00.add(UChi_00) - -result_21.addTimesI(UChi_01) -result_01.add(UChi_01) - -result_22.addTimesI(UChi_02) -result_02.add(UChi_02) - -result_30.subTimesI(UChi_10) -result_10.add(UChi_10) - -result_31.subTimesI(UChi_11) -result_11.add(UChi_11) - -result_32.subTimesI(UChi_12) -result_12.add(UChi_12) -asmclose() -debugall('ZM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] -write('// TP_RECON_ACCUM') -definemultiline(F'TP_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.add(UChi_00) -# result_21.add(UChi_01) -# result_22.add(UChi_02) -# result_30.add(UChi_10) -# result_31.add(UChi_11) -# result_32.add(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.add(UChi_00) - -result_01.add(UChi_01) -result_21.add(UChi_01) - -result_02.add(UChi_02) -result_22.add(UChi_02) - -result_10.add(UChi_10) -result_30.add(UChi_10) - -result_11.add(UChi_11) -result_31.add(UChi_11) - -result_12.add(UChi_12) -result_32.add(UChi_12) -asmclose() -debugall('TP_RECON_ACCUM', group='result') -newline() - -d['factor'] = 1 -d['cycles_RECON'] += 9 * d['factor'] 
-write('// TM_RECON_ACCUM') -definemultiline(F'TM_RECON_ACCUM_{PRECSUFFIX}') -asmopen() -#pg1.loadpredication() -# result_00.add(UChi_00) -# result_01.add(UChi_01) -# result_02.add(UChi_02) -# result_10.add(UChi_10) -# result_11.add(UChi_11) -# result_12.add(UChi_12) -# result_20.sub(UChi_00) -# result_21.sub(UChi_01) -# result_22.sub(UChi_02) -# result_30.sub(UChi_10) -# result_31.sub(UChi_11) -# result_32.sub(UChi_12) - -result_00.add(UChi_00) # reordered -result_20.sub(UChi_00) - -result_01.add(UChi_01) -result_21.sub(UChi_01) - -result_02.add(UChi_02) -result_22.sub(UChi_02) - -result_10.add(UChi_10) -result_30.sub(UChi_10) - -result_11.add(UChi_11) -result_31.sub(UChi_11) - -result_12.add(UChi_12) -result_32.sub(UChi_12) -asmclose() -debugall('TM_RECON_ACCUM', group='result') -newline() - -d['factor'] = 0 -# have 12 instructions -# picking dual issue versions -d['cycles_ZERO_PSI'] += 6 * d['factor'] -write('// ZERO_PSI') -definemultiline(F'ZERO_PSI_{PRECSUFFIX}') -asmopen() -pg1.loadpredication() -result_00.zero() -result_01.zero() -result_02.zero() -result_10.zero() -result_11.zero() -result_12.zero() -result_20.zero() -result_21.zero() -result_22.zero() -result_30.zero() -result_31.zero() -result_32.zero() -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# prefetch store spinors to L2 cache -d['factor'] = 0 -d['cycles_PREFETCH_L2'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') -definemultiline(F'PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() -fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L2_store(F"base", 0) -prefetch_L2_store(F"base", 1) -prefetch_L2_store(F"base", 2) -asmclose() -curlyclose() -newline() - -# prefetch store spinors to L1 cache -d['factor'] = 0 -d['cycles_PREFETCH_L1'] += 0 * d['factor'] -write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') -definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') -curlyopen() 
-fetch_base_ptr(F"base") -asmopen() -fetch_base_ptr(F"base", target='A') -prefetch_L1_store(F"base", 0) -prefetch_L1_store(F"base", 1) -prefetch_L1_store(F"base", 2) -asmclose() -curlyclose() -newline() - - -d['factor'] = 0 -write('// ADD_RESULT_INTERNAL') -definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}') -asmopen() -result_00.add(Chimu_00) -result_01.add(Chimu_01) -result_02.add(Chimu_02) -result_10.add(Chimu_10) -result_11.add(Chimu_11) -result_12.add(Chimu_12) -result_20.add(Chimu_20) -result_21.add(Chimu_21) -result_22.add(Chimu_22) -result_30.add(Chimu_30) -result_31.add(Chimu_31) -result_32.add(Chimu_32) -asmclose() -#debugall('ZERO_PSI', group='result') -newline() - -# -------------------------------------------------------------------------------- - -# C -f = open('w.h', 'w') -f.write(d['C']) -f.close() - -# intrin -f = open('wi.h', 'w') -f.write(d['I']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_intrin_double.h" -else: - filename = "Fujitsu_A64FX_intrin_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['I']) -f.close() - - -# asm -f = open('wa.h', 'w') -f.write(d['A']) -f.close() - -filename = '' -if PRECISION == 'double': - filename = "Fujitsu_A64FX_asm_double.h" -else: - filename = "Fujitsu_A64FX_asm_single.h" -f = open(filename, 'w') -f.write(LEGAL.format(filename)) -f.write(d['A']) -f.close() - - -# arithmetics instruction count, mul/mac = 2 instructions each -d['acount'] = d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['addTimesI'] + d['subTimesI'] - -# permutations -d['permutes'] += 2*d['timesI'] + 1*d['timesMinusI'] -d['neg'] = 1*d['timesI'] + 1*d['timesMinusI'] - -# instruction count, mul/mac = 2 instructions each, +/- *i = 3 instructions each -d['icount'] = d['load'] + d['store'] + d['move'] + d['add'] + d['sub'] + \ - d['mul'] + d['mac'] + d['permutes'] + d['neg'] + \ - d['addTimesI'] + d['subTimesI'] + d['zero'] + d['movprfx'] - -# flops -d['flops'] = 4*d['mac'] + 
3*d['mul'] + d['add'] + d['sub'] + \ - d['addTimesI'] + d['subTimesI'] - - - - - -print('Statistics') -print('') -print('Type Occurences Total / Arith instructions') -print('-------------------------------------------------------------------') -print('Variables {:4d}'.format(d['registers'])) -print('') -print('load {:4d}'.format(d['load'])) -print('store {:4d}'.format(d['store'])) -print('move {:4d}'.format(d['move'])) -print('movprfx {:4d}'.format(d['movprfx'])) -print('zero {:4d}'.format(d['zero'])) -print('negate {:4d}'.format(d['neg'])) - - -print('add {:4d} {:0.2f} / {:0.2f}'.\ - format(d['add'], d['add'] / d['icount'], d['add'] / d['acount'])) -print('sub {:4d} {:0.2f} / {:0.2f}'.\ - format(d['sub'], d['sub'] / d['icount'], d['sub'] / d['acount'])) -print('mul {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mul'], 2*d['mul'] / d['icount'], 2*d['mul'] / d['acount'])) -print('mac {:4d} {:0.2f} / {:0.2f}'.\ - format(d['mac'], 2*d['mac'] / d['icount'], 2*d['mac'] / d['acount'])) -print('addTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['addTimesI'], 2*d['addTimesI'] / d['icount'], 2*d['addTimesI'] / d['acount'])) -print('subTimesI {:4d} {:0.2f} / {:0.2f}'.\ - format(d['subTimesI'], 2*d['subTimesI'] / d['icount'], 2*d['subTimesI'] / d['acount'])) - -print('timesI {:4d}'.format(d['timesI'])) -print('timesMinusI {:4d}'.format(d['timesMinusI'])) -print('permutes {:4d} {:0.2f}'.\ - format(d['permutes'], d['permutes'] / d['icount'])) -print('') -print('flops {:4d}'.format(d['flops'])) -print('instruction count {:4d}'.format(d['icount'])) -print('arith. 
instruction count {:4d} {:0.2f}'.\ - format(d['acount'], d['acount'] / d['icount'])) - - -# ---- static pipeline resources consumption ---- -FLA = 0 -FLA += 2 * d['mac'] + 2 * d['mul'] -FLA += 1 * d['addTimesI'] + 1 * d['subTimesI'] -FLA += 1 * d['move'] -FLA += 1 * d['permutes'] -FLA += 1 * d['store'] -FLA += 1 * d['zero'] - -FLB = 0 -FLB += 1 * d['addTimesI'] + 1 * d['subTimesI'] - -FLAB = 0 -FLAB += 1 * d['mac'] + 1 * d['mul'] -FLAB += 1 * d['add'] + 1 * d['sub'] -FLAB += 1 * d['neg'] + 1 * d['movprfx'] -#FLAB += 1 * d['zero'] - - -FL_slots = 2 * d['icount'] -FL_micro_ops = FLA + FLB + FLAB - -print('') -print('------------------------------------------------------------------') -print('') -print('Static FL slot usage') -print('') -print(' FLA {:4d}'.format(FLA)) -print(' FLB {:4d}'.format(FLB)) -print(' FLA/B {:4d}'.format(FLAB)) - -print('') -print('Static FL slot efficiency') -print('') -print(' Total FL slots {:4d}'.format(FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / FL_slots)) - -cycles_total = d['cycles_ZERO_PSI'] + d['cycles_LOAD_CHIMU'] + \ - d['cycles_PROJ'] + d['cycles_PERM'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] + d['cycles_RESULT'] -cycles_total_hidden = d['cycles_ZERO_PSI'] + \ - d['cycles_PROJ'] + d['cycles_MULT_2SPIN'] + \ - d['cycles_RECON'] - -# ---- dynamic estimate ---- - -print('') -print('Dynamic cycles estimate (incl. 
latencies)') -print('') -print(' ZERO_PSI {:4d}'.format(d['cycles_ZERO_PSI'])) -print(' LOAD_CHIMU {:4d}'.format(d['cycles_LOAD_CHIMU'])) -print(' PROJ {:4d}'.format(d['cycles_PROJ'])) -print(' PERM {:4d}'.format(d['cycles_PERM'])) -print(' MULT_2SPIN {:4d}'.format(d['cycles_MULT_2SPIN'])) -print(' RECON {:4d}'.format(d['cycles_RECON'])) -print(' STORE {:4d}'.format(d['cycles_RESULT'])) -print('') -print(' Sum {:4d}'.format(cycles_total)) -print('') -print(' Sum* {:4d}'.format(cycles_total_hidden)) -print(' Total FL slots* {:4d}'.format(cycles_total_hidden * 2)) -print(' FL slots occupied* {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency* {:0.2f}'.format(FL_micro_ops / (2*cycles_total_hidden))) -print('') -print(' *load/store/PERM hidden') - -estimated_cycles = cycles_total_hidden -# Estimate percent peak DP; dual issue, fma -pp = 100 * 4 * d['flops'] / (2*2*8*estimated_cycles) -print('') -print('Model prediction') -print('') -print(' Cycles* {:4d}'.format(estimated_cycles)) -print(' Percent peak* {:4.1f} %'.format(pp)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10 = (d['load'] + d['store']) * 64 * 2.2 / estimated_cycles -tp2 = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / estimated_cycles -print('') -print(' Estimated RF throughput* {:4.1f} GB/s'.\ - format(tp10)) -print(' Estimated RF throughput* {:4.1f} GiB/s'.\ - format(tp2)) - -# ---- dynamic pipeline resources consumption ---- - -runtime = measured_cycles # runtime in cycles -pp_runtime = 100 * 4 * d['flops'] / (2*2*8*runtime) -runtime_FL_slots = 2 * runtime -delta = runtime - estimated_cycles - - -print('') -print('------------------------------------------------------------------') -print('') -print('Dynamic runtime analysis (cycles from measurements)') -print('') -print(' Cycles {:4d}'.format(runtime)) -print(' Percent peak {:4.1f} %'.format(pp_runtime)) -print(' Deviation from estimate {:4d} {:4.2f} %'.\ - format(delta, 100. 
* abs(delta/runtime))) -print(' Deviation per direction {:4.1f}'.format(delta/8)) - -# estimated RF throughput in GB/s @ 2.2 GHz -tp10_rt = (d['load'] + d['store']) * 64 * 2.2 / runtime -tp2_rt = (d['load'] + d['store']) * 64 * 1000.**3 * 2.2 / 1024.**3 / runtime -print('') -print(' RF throughput {:4.1f} GB/s'.\ - format(tp10_rt)) -print(' RF throughput {:4.1f} GiB/s'.\ - format(tp2_rt)) -print('') -print(' Total FL slots {:4d}'.format(runtime_FL_slots)) -print(' FL slots occupied {:4d}'.format(FL_micro_ops)) -print(' FL slot efficiency {:0.2f}'.format(FL_micro_ops / runtime_FL_slots)) -print('') diff --git a/Grid/stencil/SimpleCompressor.h b/Grid/stencil/SimpleCompressor.h index be7c89c0..2ce48369 100644 --- a/Grid/stencil/SimpleCompressor.h +++ b/Grid/stencil/SimpleCompressor.h @@ -7,20 +7,20 @@ template class SimpleCompressor { public: void Point(int) {}; - accelerator_inline int CommDatumSize(void) { return sizeof(vobj); } - accelerator_inline bool DecompressionStep(void) { return false; } - template accelerator_inline void Compress(cobj *buf,int o,const cobj &in) { buf[o]=in; } - accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o){ + accelerator_inline int CommDatumSize(void) const { return sizeof(vobj); } + accelerator_inline bool DecompressionStep(void) const { return false; } + template accelerator_inline void Compress(cobj *buf,int o,const cobj &in) const { buf[o]=in; } + accelerator_inline void Exchange(vobj *mp,vobj *vp0,vobj *vp1,Integer type,Integer o) const { exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type); } - accelerator_inline void Decompress(vobj *out,vobj *in, int o){ assert(0); } + accelerator_inline void Decompress(vobj *out,vobj *in, int o) const { assert(0); } accelerator_inline void CompressExchange(vobj *out0,vobj *out1,const vobj *in, - int j,int k, int m,int type){ + int j,int k, int m,int type) const { exchange(out0[j],out1[j],in[k],in[m],type); } // For cshift. 
Cshift should drop compressor coupling altogether // because I had to decouple the code from the Stencil anyway - accelerator_inline vobj operator() (const vobj &arg) { + accelerator_inline vobj operator() (const vobj &arg) const { return arg; } }; diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1e198972..58cebed3 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -147,16 +147,16 @@ class CartesianStencilAccelerator { cobj* u_recv_buf_p; cobj* u_send_buf_p; - accelerator_inline cobj *CommBuf(void) { return u_recv_buf_p; } + accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; } - accelerator_inline int GetNodeLocal(int osite,int point) { + accelerator_inline int GetNodeLocal(int osite,int point) const { return this->_entries_p[point+this->_npoints*osite]._is_local; } - accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) { + accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const { ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite]; } - accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { + accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; local = this->_entries_p[ent]._is_local; perm = this->_entries_p[ent]._permute; @@ -168,14 +168,14 @@ class CartesianStencilAccelerator { } } - accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) { + accelerator_inline uint64_t GetPFInfo(int ent,uint64_t base) const { uint64_t cbase = (uint64_t)&u_recv_buf_p[0]; int local = this->_entries_p[ent]._is_local; if (local) return base + this->_entries_p[ent]._byte_offset; else return cbase + this->_entries_p[ent]._byte_offset; } - accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) + accelerator_inline void iCoorFromIindex(Coordinate &coor,int lane) const { 
Lexicographic::CoorFromIndex(coor,lane,this->_simd_layout); } @@ -221,7 +221,7 @@ public: typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; - typedef CartesianStencilView View_type; + typedef const CartesianStencilView View_type; typedef typename View_type::StencilVector StencilVector; /////////////////////////////////////////// // Helper structs @@ -269,7 +269,7 @@ public: std::vector > > face_table ; Vector surface_list; - Vector _entries; // Resident in managed memory + stencilVector _entries; // Resident in managed memory std::vector Packets; std::vector Mergers; std::vector MergersSHM; diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index ec57a679..0a7d3382 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -64,6 +64,71 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ } #else + +//#ifndef GRID_SYCL +#if 1 +// Use the scalar as our own complex on GPU ... thrust::complex or std::complex +template = 0> accelerator_inline +typename vsimd::scalar_type +coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::scalar_type S; + S * __restrict__ p=(S *)&vec; + return p[lane]; +} +template = 0> accelerator_inline +typename vsimd::scalar_type +coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::scalar_type S; + + S * __restrict__ p=(S *)&vec; + int mask = vsimd::Nsimd() >> (ptype + 1); + int plane= doperm ? 
lane ^ mask : lane; + return p[plane]; +} +template = 0> accelerator_inline +void coalescedWrite(vsimd & __restrict__ vec, + const typename vsimd::scalar_type & __restrict__ extracted, + int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::scalar_type S; + S * __restrict__ p=(S *)&vec; + p[lane]=extracted; +} +#else +// For SyCL have option to use GpuComplex from inside the vector type in SIMT loops +// Faster for some reason +template = 0> accelerator_inline +typename vsimd::vector_type::datum +coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + S * __restrict__ p=(S *)&vec; + return p[lane]; +} +template = 0> accelerator_inline +typename vsimd::vector_type::datum +coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + + S * __restrict__ p=(S *)&vec; + int mask = vsimd::Nsimd() >> (ptype + 1); + int plane= doperm ? 
lane ^ mask : lane; + return p[plane]; +} +template = 0> accelerator_inline +void coalescedWrite(vsimd & __restrict__ vec, + const typename vsimd::vector_type::datum & __restrict__ extracted, + int lane=acceleratorSIMTlane(vsimd::Nsimd())) +{ + typedef typename vsimd::vector_type::datum S; + S * __restrict__ p=(S *)&vec; + p[lane]=extracted; +} +#endif + ////////////////////////////////////////// // Extract and insert slices on the GPU ////////////////////////////////////////// diff --git a/Grid/tensors/Tensor_Ta.h b/Grid/tensors/Tensor_Ta.h index 1ef9fc23..90e57b2b 100644 --- a/Grid/tensors/Tensor_Ta.h +++ b/Grid/tensors/Tensor_Ta.h @@ -95,14 +95,18 @@ accelerator_inline iMatrix ProjectOnGroup(const iMatrix &arg) vtype nrm; vtype inner; for(int c1=0;c1 ProjectOnGroup(const iMatrix &arg) ret._internal[b][c] -= pr * ret._internal[c1][c]; } } - + } + + // Normalise last row + { + int c1 = N-1; + zeroit(inner); + for(int c2=0;c2 #ifndef GRID_MATH_EXP_H #define GRID_MATH_EXP_H -#define DEFAULT_MAT_EXP 12 +#define DEFAULT_MAT_EXP 20 NAMESPACE_BEGIN(Grid); diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h index f1ded209..ea619d0f 100644 --- a/Grid/tensors/Tensor_extract_merge.h +++ b/Grid/tensors/Tensor_extract_merge.h @@ -1,5 +1,5 @@ /************************************************************************************* - +n Grid physics library, www.github.com/paboyle/Grid Source file: ./lib/tensors/Tensor_extract_merge.h @@ -153,7 +153,7 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob // Extract to a bunch of scalar object pointers of different scalar type, with offset. 
Useful for precision change //////////////////////////////////////////////////////////////////////// template accelerator -void extract(const vobj &vec,ExtractPointerArray &extracted, int offset) +void extract(const vobj &vec,const ExtractPointerArray &extracted, int offset) { typedef typename GridTypeMapper::scalar_type sobj_scalar_type; typedef typename GridTypeMapper::scalar_type scalar_type; @@ -181,7 +181,7 @@ void extract(const vobj &vec,ExtractPointerArray &extracted, int offset) // Merge bunch of scalar object pointers of different scalar type, with offset. Useful for precision change //////////////////////////////////////////////////////////////////////// template accelerator -void merge(vobj &vec,ExtractPointerArray &extracted, int offset) +void merge(vobj &vec,const ExtractPointerArray &extracted, int offset) { typedef typename GridTypeMapper::scalar_type sobj_scalar_type; typedef typename GridTypeMapper::scalar_type scalar_type; diff --git a/Grid/tensors/Tensor_outer.h b/Grid/tensors/Tensor_outer.h index 4902c22f..a32a2a91 100644 --- a/Grid/tensors/Tensor_outer.h +++ b/Grid/tensors/Tensor_outer.h @@ -34,6 +34,16 @@ NAMESPACE_BEGIN(Grid); // outerProduct Scalar x Scalar -> Scalar // Vector x Vector -> Matrix /////////////////////////////////////////////////////////////////////////////////////// +template = 0> +accelerator_inline CC outerProduct(const CC &l, const CC& r) +{ + return l*conj(r); +} +template = 0> +accelerator_inline RR outerProduct(const RR &l, const RR& r) +{ + return l*r; +} template accelerator_inline auto outerProduct (const iVector& lhs,const iVector& rhs) -> iMatrix @@ -57,17 +67,6 @@ auto outerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar = 0> -accelerator_inline CC outerProduct(const CC &l, const CC& r) -{ - return l*conj(r); -} -template = 0> -accelerator_inline RR outerProduct(const RR &l, const RR& r) -{ - return l*r; -} - NAMESPACE_END(Grid); #endif diff --git a/Grid/tensors/Tensor_unary.h 
b/Grid/tensors/Tensor_unary.h index 3adc2897..9686fd49 100644 --- a/Grid/tensors/Tensor_unary.h +++ b/Grid/tensors/Tensor_unary.h @@ -84,7 +84,6 @@ NAMESPACE_BEGIN(Grid); } UNARY(sqrt); -UNARY(rsqrt); UNARY(sin); UNARY(cos); UNARY(asin); diff --git a/Grid/threads/Accelerator.cc b/Grid/threads/Accelerator.cc index 2134d158..9c40f538 100644 --- a/Grid/threads/Accelerator.cc +++ b/Grid/threads/Accelerator.cc @@ -1,6 +1,7 @@ #include NAMESPACE_BEGIN(Grid); +int acceleratorAbortOnGpuError=1; uint32_t accelerator_threads=2; uint32_t acceleratorThreads(void) {return accelerator_threads;}; void acceleratorThreads(uint32_t t) {accelerator_threads = t;}; @@ -21,22 +22,26 @@ void acceleratorInit(void) #define ENV_RANK_SLURM "SLURM_PROCID" #define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK" #define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK" - // We extract the local rank initialization using an environment variable - if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { - printf("OPENMPI detected\n"); - rank = atoi(localRankStr); - } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { - printf("MVAPICH detected\n"); - rank = atoi(localRankStr); - } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { - printf("SLURM detected\n"); - rank = atoi(localRankStr); - } else { - printf("MPI version is unknown - bad things may happen\n"); - } if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);} if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);} + // We extract the local rank initialization using an environment variable + if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) { + if (!world_rank) + printf("OPENMPI detected\n"); + rank = atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) { + if (!world_rank) + printf("MVAPICH detected\n"); + rank 
= atoi(localRankStr); + } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) { + if (!world_rank) + printf("SLURM detected\n"); + rank = atoi(localRankStr); + } else { + if (!world_rank) + printf("MPI version is unknown - bad things may happen\n"); + } size_t totalDeviceMem=0; for (int i = 0; i < nDevices; i++) { @@ -48,7 +53,6 @@ void acceleratorInit(void) prop = gpu_props[i]; totalDeviceMem = prop.totalGlobalMem; if ( world_rank == 0) { -#ifndef GRID_IBM_SUMMIT if ( i==rank ) { printf("AcceleratorCudaInit[%d]: ========================\n",rank); printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i); @@ -62,8 +66,8 @@ void acceleratorInit(void) GPU_PROP(warpSize); GPU_PROP(pciBusID); GPU_PROP(pciDeviceID); + printf("AcceleratorCudaInit[%d]: maxGridSize (%d,%d,%d)\n",rank,prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2]); } -#endif // GPU_PROP(unifiedAddressing); // GPU_PROP(l2CacheSize); // GPU_PROP(singleToDoublePrecisionPerfRatio); @@ -73,11 +77,17 @@ void acceleratorInit(void) #undef GPU_PROP_FMT #undef GPU_PROP -#ifdef GRID_IBM_SUMMIT +#ifdef GRID_DEFAULT_GPU // IBM Jsrun makes cuda Device numbering screwy and not match rank - if ( world_rank == 0 ) printf("AcceleratorCudaInit: IBM Summit or similar - use default device\n"); + if ( world_rank == 0 ) { + printf("AcceleratorCudaInit: using default device \n"); + printf("AcceleratorCudaInit: assume user either uses a) IBM jsrun, or \n"); + printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n"); + printf("AcceleratorCudaInit: Configure options --enable-summit, --enable-select-gpu=no \n"); + } #else printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank); + printf("AcceleratorCudaInit: Configure options --enable-select-gpu=yes \n"); cudaSetDevice(rank); #endif if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n"); @@ 
-139,11 +149,18 @@ void acceleratorInit(void) MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours #undef GPU_PROP_FMT #undef GPU_PROP -#ifdef GRID_IBM_SUMMIT - // IBM Jsrun makes cuda Device numbering screwy and not match rank - if ( world_rank == 0 ) printf("AcceleratorHipInit: IBM Summit or similar - NOT setting device to node rank\n"); + +#ifdef GRID_DEFAULT_GPU + if ( world_rank == 0 ) { + printf("AcceleratorHipInit: using default device \n"); + printf("AcceleratorHipInit: assume user either uses a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n"); + printf("AcceleratorHipInit: Configure options --enable-summit, --enable-select-gpu=no \n"); + } #else - if ( world_rank == 0 ) printf("AcceleratorHipInit: setting device to node rank\n"); + if ( world_rank == 0 ) { + printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank); + printf("AcceleratorHipInit: Configure options --enable-select-gpu=yes \n"); + } hipSetDevice(rank); #endif if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n"); @@ -154,7 +171,6 @@ void acceleratorInit(void) #ifdef GRID_SYCL cl::sycl::queue *theGridAccelerator; - void acceleratorInit(void) { int nDevices = 1; @@ -162,6 +178,10 @@ void acceleratorInit(void) cl::sycl::device selectedDevice { selector }; theGridAccelerator = new sycl::queue (selectedDevice); +#ifdef GRID_SYCL_LEVEL_ZERO_IPC + zeInit(0); +#endif + char * localRankStr = NULL; int rank = 0, world_rank=0; #define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK" diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index eb1cfb94..7ff3e1e4 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -39,6 +39,10 @@ Author: paboyle #ifdef HAVE_MM_MALLOC_H #include #endif +#ifdef __APPLE__ +// no memalign +inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); } +#endif NAMESPACE_BEGIN(Grid); @@ -100,9 
+104,11 @@ void acceleratorInit(void); #define accelerator __host__ __device__ #define accelerator_inline __host__ __device__ inline +extern int acceleratorAbortOnGpuError; + accelerator_inline int acceleratorSIMTlane(int Nsimd) { #ifdef GRID_SIMT - return threadIdx.z; + return threadIdx.x; #else return 0; #endif @@ -110,36 +116,77 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { #define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \ { \ + int nt=acceleratorThreads(); \ typedef uint64_t Iterator; \ auto lambda = [=] accelerator \ (Iterator iter1,Iterator iter2,Iterator lane) mutable { \ __VA_ARGS__; \ }; \ - int nt=acceleratorThreads(); \ - dim3 cu_threads(acceleratorThreads(),1,nsimd); \ + dim3 cu_threads(nsimd,acceleratorThreads(),1); \ dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \ LambdaApply<<>>(num1,num2,nsimd,lambda); \ } +#define accelerator_for6dNB(iter1, num1, \ + iter2, num2, \ + iter3, num3, \ + iter4, num4, \ + iter5, num5, \ + iter6, num6, ... ) \ + { \ + typedef uint64_t Iterator; \ + auto lambda = [=] accelerator \ + (Iterator iter1,Iterator iter2, \ + Iterator iter3,Iterator iter4, \ + Iterator iter5,Iterator iter6) mutable { \ + __VA_ARGS__; \ + }; \ + dim3 cu_blocks (num1,num2,num3); \ + dim3 cu_threads(num4,num5,num6); \ + Lambda6Apply<<>>(num1,num2,num3,num4,num5,num6,lambda); \ + } + template __global__ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda) { - uint64_t x = threadIdx.x + blockDim.x*blockIdx.x; - uint64_t y = threadIdx.y + blockDim.y*blockIdx.y; - uint64_t z = threadIdx.z; + // Weird permute is to make lane coalesce for large blocks + uint64_t x = threadIdx.y + blockDim.y*blockIdx.x; + uint64_t y = threadIdx.z + blockDim.z*blockIdx.y; + uint64_t z = threadIdx.x; if ( (x < num1) && (y __global__ +void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3, + uint64_t num4, uint64_t num5, uint64_t num6, + lambda Lambda) +{ + uint64_t iter1 = blockIdx.x; + uint64_t iter2 = blockIdx.y; + 
uint64_t iter3 = blockIdx.z; + uint64_t iter4 = threadIdx.x; + uint64_t iter5 = threadIdx.y; + uint64_t iter6 = threadIdx.z; + + if ( (iter1 < num1) && (iter2 #include + +#define GRID_SYCL_LEVEL_ZERO_IPC + +#ifdef GRID_SYCL_LEVEL_ZERO_IPC +#include +#include +#endif NAMESPACE_BEGIN(Grid); extern cl::sycl::queue *theGridAccelerator; @@ -217,11 +274,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { unsigned long nt=acceleratorThreads(); \ unsigned long unum1 = num1; \ unsigned long unum2 = num2; \ + if(nt < 8)nt=8; \ cl::sycl::range<3> local {nt,1,nsimd}; \ cl::sycl::range<3> global{unum1,unum2,nsimd}; \ cgh.parallel_for( \ cl::sycl::nd_range<3>(global,local), \ - [=] (cl::sycl::nd_item<3> item) mutable { \ + [=] (cl::sycl::nd_item<3> item) /*mutable*/ \ + [[intel::reqd_sub_group_size(8)]] \ + { \ auto iter1 = item.get_global_id(0); \ auto iter2 = item.get_global_id(1); \ auto lane = item.get_global_id(2); \ @@ -235,8 +295,10 @@ inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*t inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theGridAccelerator->memcpy(to,from,bytes); theGridAccelerator->wait();} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { theGridAccelerator->memset(base,value,bytes); theGridAccelerator->wait();} inline int acceleratorIsCommunicable(void *ptr) { #if 0 @@ -334,10 +396,12 @@ inline void *acceleratorAllocDevice(size_t 
bytes) return ptr; }; -inline void acceleratorFreeShared(void *ptr){ free(ptr);}; +inline void acceleratorFreeShared(void *ptr){ hipFree(ptr);}; inline void acceleratorFreeDevice(void *ptr){ hipFree(ptr);}; inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);} +inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(base,value,bytes);} #endif @@ -360,11 +424,13 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc ////////////////////////////////////////////// // CPU Target - No accelerator just thread instead ////////////////////////////////////////////// -#define GRID_ALLOC_ALIGN (2*1024*1024) // 2MB aligned + #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) #undef GRID_SIMT + + #define accelerator #define accelerator_inline strong_inline #define accelerator_for(iterator,num,nsimd, ... 
) thread_for(iterator, num, { __VA_ARGS__ }); @@ -375,8 +441,10 @@ inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ hipMemc accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ memcpy(to,from,bytes);} +inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes) { memcpy(to,from,bytes);} inline int acceleratorIsCommunicable(void *ptr){ return 1; } +inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);} #ifdef HAVE_MM_MALLOC_H inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);}; @@ -399,6 +467,8 @@ inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN, inline void acceleratorFreeCpu (void *ptr){free(ptr);}; #endif + + /////////////////////////////////////////////////// // Synchronise across local threads for divergence resynch /////////////////////////////////////////////////// @@ -409,7 +479,7 @@ accelerator_inline void acceleratorSynchronise(void) __syncwarp(); #endif #ifdef GRID_SYCL - // No barrier call on SYCL?? 
// Option get __spir:: stuff to do warp barrier + //cl::sycl::detail::workGroupBarrier(); #endif #ifdef GRID_HIP __syncthreads(); diff --git a/Grid/util/CompilerCompatible.h b/Grid/util/CompilerCompatible.h index 37331668..7c4a056d 100644 --- a/Grid/util/CompilerCompatible.h +++ b/Grid/util/CompilerCompatible.h @@ -1,5 +1,16 @@ #pragma once +#if defined(__NVCC__) + +#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 0) +#error "NVCC version 11.0 breaks on Ampere, see Github issue 346" +#endif +#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 1) +#error "NVCC version 11.1 breaks on Ampere, see Github issue 346" +#endif + +#endif + #if defined(__clang__) #if __clang_major__ < 3 diff --git a/Grid/util/Init.cc b/Grid/util/Init.cc index d81fafb3..77c374bf 100644 --- a/Grid/util/Init.cc +++ b/Grid/util/Init.cc @@ -56,6 +56,8 @@ Author: paboyle static int feenableexcept (unsigned int excepts) { +#if 0 + // Fails on Apple M1 static fenv_t fenv; unsigned int new_excepts = excepts & FE_ALL_EXCEPT; unsigned int old_excepts; // previous masks @@ -70,6 +72,8 @@ feenableexcept (unsigned int excepts) iold_excepts = (int) old_excepts; return ( fesetenv (&fenv) ? 
-1 : iold_excepts ); +#endif + return 0; } #endif @@ -140,7 +144,7 @@ void GridCmdOptionCSL(std::string str,std::vector & vec) } template -void GridCmdOptionIntVector(std::string &str,VectorInt & vec) +void GridCmdOptionIntVector(const std::string &str,VectorInt & vec) { vec.resize(0); std::stringstream ss(str); @@ -153,6 +157,9 @@ void GridCmdOptionIntVector(std::string &str,VectorInt & vec) return; } +template void GridCmdOptionIntVector(const std::string &str,std::vector & vec); +template void GridCmdOptionIntVector(const std::string &str,Coordinate & vec); + void GridCmdOptionInt(std::string &str,int & val) { std::stringstream ss(str); @@ -480,11 +487,13 @@ void Grid_init(int *argc,char ***argv) LebesgueOrder::UseLebesgueOrder=1; } CartesianCommunicator::nCommThreads = 1; +#ifdef GRID_COMMS_THREADS if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads"); GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads); assert(CartesianCommunicator::nCommThreads > 0); } +#endif if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); GridCmdOptionIntVector(arg,LebesgueOrder::Block); diff --git a/Grid/util/Init.h b/Grid/util/Init.h index 0a7baca6..2b53739b 100644 --- a/Grid/util/Init.h +++ b/Grid/util/Init.h @@ -55,7 +55,7 @@ template std::string GridCmdVectorIntToString(const VectorInt & vec); void GridCmdOptionCSL(std::string str,std::vector & vec); template -void GridCmdOptionIntVector(std::string &str,VectorInt & vec); +void GridCmdOptionIntVector(const std::string &str,VectorInt & vec); void GridCmdOptionInt(std::string &str,int & val); // ypj [add] void GridCmdOptionFloat(std::string &str,double & val); diff --git a/HMC/Mobius2p1fRHMC.cc b/HMC/Mobius2p1fRHMC.cc index 82ca4d37..b958d548 100644 --- a/HMC/Mobius2p1fRHMC.cc +++ b/HMC/Mobius2p1fRHMC.cc @@ -56,12 +56,12 @@ int main(int argc, char **argv) { MD.trajL = 1.0; 
HMCparameters HMCparams; - HMCparams.StartTrajectory = 30; + HMCparams.StartTrajectory = 0; HMCparams.Trajectories = 200; HMCparams.NoMetropolisUntil= 0; // "[HotStart, ColdStart, TepidStart, CheckpointStart]\n"; - // HMCparams.StartingType =std::string("ColdStart"); - HMCparams.StartingType =std::string("CheckpointStart"); + HMCparams.StartingType =std::string("ColdStart"); + // HMCparams.StartingType =std::string("CheckpointStart"); HMCparams.MD = MD; HMCWrapper TheHMC(HMCparams); diff --git a/Makefile.am b/Makefile.am index 33b25026..d2a1a326 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,5 +1,5 @@ # additional include paths necessary to compile the C++ library -SUBDIRS = Grid HMC benchmarks tests +SUBDIRS = Grid HMC benchmarks tests examples include $(top_srcdir)/doxygen.inc diff --git a/README b/README index 86506f52..0beabff3 100644 --- a/README +++ b/README @@ -111,11 +111,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu ``` bash mkdir build; cd build -../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix= +../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix= ``` -where `--enable-precision=` set the default precision, -`--enable-simd=` set the SIMD type, `--enable- +where `--enable-simd=` set the SIMD type, `--enable- comms=`, and `` should be replaced by the prefix path where you want to install Grid. Other options are detailed in the next section, you can also use `configure --help` to display them. Like with any other program using GNU autotool, the @@ -146,8 +145,8 @@ If you want to build all the tests at once just use `make tests`. - `--enable-numa`: enable NUMA first touch optimisation - `--enable-simd=`: setup Grid for the SIMD target `` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. - `--enable-gen-simd-width=`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). 
-- `--enable-precision={single|double}`: set the default precision (default: `double`). -- `--enable-precision=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. +- `--enable-precision={single|double}`: set the default precision (default: `double`). **Deprecated option** +- `--enable-comms=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--disable-timers`: disable system dependent high-resolution timers. - `--enable-chroma`: enable Chroma regression tests. @@ -201,8 +200,7 @@ Alternatively, some CPU codenames can be directly used: The following configuration is recommended for the Intel Knights Landing platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -212,8 +210,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -232,8 +229,7 @@ for interior communication. This is the mpi3 communications implementation. We recommend four ranks per node for best performance, but optimum is local volume dependent. 
``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi3-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -244,8 +240,7 @@ We recommend four ranks per node for best performance, but optimum is local volu The following configuration is recommended for the Intel Haswell platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -262,8 +257,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -280,8 +274,7 @@ This is the default. The following configuration is recommended for the Intel Skylake platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=mpiicpc @@ -298,8 +291,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -330,8 +322,7 @@ and 8 threads per rank. The following configuration is recommended for the AMD EPYC platform. 
``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ CXX=mpicxx ``` diff --git a/README.md b/README.md index 9f690ce0..88b922a5 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid) +# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview) **Data parallel C++ mathematical object library.** @@ -115,11 +115,10 @@ Now you can execute the `configure` script to generate makefiles (here from a bu ``` bash mkdir build; cd build -../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix= +../configure --enable-simd=AVX --enable-comms=mpi-auto --prefix= ``` -where `--enable-precision=` set the default precision, -`--enable-simd=` set the SIMD type, `--enable- +where `--enable-simd=` set the SIMD type, `--enable- comms=`, and `` should be replaced by the prefix path where you want to install Grid. Other options are detailed in the next section, you can also use `configure --help` to display them. Like with any other program using GNU autotool, the @@ -150,8 +149,7 @@ If you want to build all the tests at once just use `make tests`. - `--enable-numa`: enable NUMA first touch optimisation - `--enable-simd=`: setup Grid for the SIMD target `` (default: `GEN`). A list of possible SIMD targets is detailed in a section below. 
- `--enable-gen-simd-width=`: select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). -- `--enable-precision={single|double}`: set the default precision (default: `double`). -- `--enable-precision=`: Use `` for message passing (default: `none`). A list of possible SIMD targets is detailed in a section below. +- `--enable-comms=<comm>`: Use `<comm>` for message passing (default: `none`). A list of possible communication interfaces is detailed in a section below. - `--enable-rng={sitmo|ranlux48|mt19937}`: choose the RNG (default: `sitmo `). - `--disable-timers`: disable system dependent high-resolution timers. - `--enable-chroma`: enable Chroma regression tests. @@ -205,8 +203,7 @@ Alternatively, some CPU codenames can be directly used: The following configuration is recommended for the Intel Knights Landing platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -216,8 +213,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -236,8 +232,7 @@ for interior communication. This is the mpi3 communications implementation. We recommend four ranks per node for best performance, but optimum is local volume dependent. 
``` bash -../configure --enable-precision=double\ - --enable-simd=KNL \ +../configure --enable-simd=KNL \ --enable-comms=mpi3-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -248,8 +243,7 @@ We recommend four ranks per node for best performance, but optimum is local volu The following configuration is recommended for the Intel Haswell platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -266,8 +260,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -284,8 +277,7 @@ This is the default. The following configuration is recommended for the Intel Skylake platform: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=mpiicpc @@ -302,8 +294,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use: ``` bash -../configure --enable-precision=double\ - --enable-simd=AVX512 \ +../configure --enable-simd=AVX512 \ --enable-comms=mpi3 \ --enable-mkl \ CXX=CC CC=cc @@ -334,8 +325,7 @@ and 8 threads per rank. The following configuration is recommended for the AMD EPYC platform. 
``` bash -../configure --enable-precision=double\ - --enable-simd=AVX2 \ +../configure --enable-simd=AVX2 \ --enable-comms=mpi3 \ CXX=mpicxx ``` diff --git a/SVE_README.txt b/SVE_README.txt index 0c167c4a..cefec4be 100644 --- a/SVE_README.txt +++ b/SVE_README.txt @@ -12,31 +12,31 @@ module load mpi/openmpi-aarch64 scl enable gcc-toolset-10 bash -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++ CC=gcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" * gcc 10.1 prebuild w/ MPI, QPACE4 interactive login scl enable gcc-toolset-10 bash module load mpi/openmpi-aarch64 -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi-auto --enable-shm=shmget --enable-openmp CXX=mpicxx CC=mpicc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" ------------------------------------------------------------------------------ * armclang 20.2 (qp4) -../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" +../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp 
CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DARMCLANGCOMPAT -DA64FXASM -DDSLASHINTRIN" ------------------------------------------------------------------------------ * gcc 10.0.1 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static * gcc 10.0.1 fixed-size ACLE (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=g++-10.0.1 CC=gcc-10.0.1 CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN" * gcc 10.0.1 fixed-size ACLE (fjt) w/ MPI @@ -46,34 +46,34 @@ export OMPI_CXX=g++-10.0.1 export MPICH_CC=gcc-10.0.1 export MPICH_CXX=g++-10.0.1 -$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM 
-DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" +$ ../configure --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -march=armv8-a+sve -msve-vector-bits=512 -fno-gcse -DA64FXFIXEDSIZE -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64 -lrt" -------------------------------------------------------- * armclang 20.0 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -fno-unroll-loops -mllvm -vectorizer-min-trip-count=2 -march=armv8-a+sve -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static TODO check ARMCLANGCOMPAT * armclang 20.1 VLA (merlin) -../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static +../configure --with-lime=/home/men04359/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx 
-DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN" LDFLAGS=-static GRID_LDFLAGS=-static MPI_CXXLDFLAGS=-static TODO check ARMCLANGCOMPAT * armclang 20.1 VLA (fjt cluster) -../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp CXX=armclang++ CC=armclang CXXFLAGS="-std=c++11 -mcpu=a64fx -DARMCLANGCOMPAT -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" TODO check ARMCLANGCOMPAT * armclang 20.1 VLA w/MPI (fjt cluster) -../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" +../configure --with-lime=$HOME/local --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi3 --enable-openmp CXX=mpiFCC CC=mpifcc CXXFLAGS="-std=c++11 -mcpu=a64fx -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU -I/opt/FJSVxtclanga/tcsds-1.2.25/include/mpi/fujitsu -lrt" LDFLAGS="-L/opt/FJSVxtclanga/tcsds-1.2.25/lib64" No ARMCLANGCOMPAT -> still correct ? @@ -81,9 +81,9 @@ No ARMCLANGCOMPAT -> still correct ? 
* Fujitsu fcc -../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=none --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=FCC CC=fcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN" * Fujitsu fcc w/ MPI -../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-precision=double --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" +../configure --with-lime=$HOME/grid-a64fx/lime/c-lime --without-hdf5 --enable-gen-simd-width=64 --enable-simd=GEN --enable-comms=mpi --enable-openmp --with-mpfr=/home/users/gre/gre-1/grid-a64fx/mpfr-build/install CXX=mpiFCC CC=mpifcc CXXFLAGS="-Nclang -Kfast -DA64FX -DA64FXASM -DDSLASHINTRIN -DTOFU" diff --git a/TODO b/TODO index f1175560..e23e040d 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,6 @@ +-- comms threads issue?? +-- Part done: Staggered kernel performance on GPU + ========================================================= General ========================================================= @@ -5,28 +8,18 @@ General - Make representations code take Gimpl - Simplify the HMCand remove modules - Lattice_arith - are the mult, mac etc.. still needed after ET engine? 
-- Lattice_rng -- Lattice_transfer.h -- accelerate A2Autils -- off critical path for HMC +- Lattice_rng - faster local only loop in init +- Audit: accelerate A2Autils -- off critical path for HMC ========================================================= -GPU branch code item work list +GPU work list ========================================================= -* sum_cpu promote to double during summation for increased precisoin. +* sum_cpu promote to double during summation for increased precision. * Introduce sumD & ReduceD * GPU sum is probably better currently. - * Accelerate the cshift & benchmark -* 0) Single GPU -- 128 bit integer table load in GPU code. - - ImprovedStaggered accelerate & measure perf - - Gianluca's changes to Cayley into gpu-port - - Mobius kernel fusion. -- Gianluca? - - Lebesque order reintroduction. StencilView should have pointer to it - - Lebesgue reorder in all kernels - * 3) Comms/NVlink - OpenMP tasks to run comms threads. Experiment with it - Remove explicit openMP in staggered. @@ -35,14 +28,6 @@ GPU branch code item work list - Stencil gather ?? - SIMD dirs in stencil -* 4) ET enhancements -- eval -> scalar ops in ET engine -- coalescedRead, coalescedWrite in expressions. - -* 5) Misc -- Conserved current clean up. -- multLinkProp eliminate - 8) Merge develop and test HMC 9) Gamma tables on GPU; check this. Appear to work, but no idea why. Are these done on CPU? @@ -52,7 +37,7 @@ GPU branch code item work list - Audit NAMESPACE CHANGES - Audit changes ------ +--------- Gianluca's changes - Performance impact of construct in aligned allocator??? --------- @@ -62,6 +47,33 @@ Gianluca's changes ----------------------------- DONE: ----------------------------- +===== +-- Done: Remez X^-1/2 X^-1/2 X = 1 test. + Feed in MdagM^2 as a test and take its sqrt. + Automated test that MdagM invsqrt(MdagM)invsqrt(MdagM) = 1 in HMC for bounds satisfaction. + +-- Done: Sycl Kernels into develop. Compare to existing unroll and just use. 
+-- Done: sRNG into refresh functions +-- Done: Tuned decomposition on CUDA into develop +-- Done: Sycl friend accessor. Const view attempt via typedef?? + + +* Done 5) Misc +- Conserved current clean up. +- multLinkProp eliminate + +* Done 0) Single GPU +- 128 bit integer table load in GPU code. + - ImprovedStaggered accelerate & measure perf + - Gianluca's changes to Cayley into gpu-port + - Mobius kernel fusion. -- Gianluca? + - Lebesque order reintroduction. StencilView should have pointer to it + - Lebesgue reorder in all kernels + +* 4) ET enhancements +- Done eval -> scalar ops in ET engine +- Done coalescedRead, coalescedWrite in expressions. + ============================================================================================= AUDIT ContractWWVV with respect to develop -- DONE - GPU accelerate EOFA -- DONE @@ -125,23 +137,6 @@ AUDIT ContractWWVV with respect to develop -- DONE - - (4) omp parallel for collapse(n) - - Only (1) has a natural mirror in accelerator_loop - - Nested loop macros get cumbersome made a generic interface for N deep -- - Don't like thread_region and thread_loop_in_region -- - Could replace with - - thread_nested(1, - for { - - } - ); - thread_nested(2, - for (){ - for (){ - - } - } - ); - - and same "in_region". 
----------------------------- diff --git a/benchmarks/Benchmark_IO.cc b/benchmarks/Benchmark_IO.cc index c8c0937f..87e7224d 100644 --- a/benchmarks/Benchmark_IO.cc +++ b/benchmarks/Benchmark_IO.cc @@ -1,10 +1,18 @@ - #include "Benchmark_IO.hpp" -#ifndef BENCH_IO_LMAX -#define BENCH_IO_LMAX 40 +#ifndef BENCH_IO_LMIN +#define BENCH_IO_LMIN 8 #endif +#ifndef BENCH_IO_LMAX +#define BENCH_IO_LMAX 32 +#endif + +#ifndef BENCH_IO_NPASS +#define BENCH_IO_NPASS 10 +#endif + +#ifdef HAVE_LIME using namespace Grid; std::string filestem(const int l) @@ -12,37 +20,182 @@ std::string filestem(const int l) return "iobench_l" + std::to_string(l); } +int vol(const int i) +{ + return BENCH_IO_LMIN + 2*i; +} + +int volInd(const int l) +{ + return (l - BENCH_IO_LMIN)/2; +} + +template +void stats(Mat &mean, Mat &stdDev, const std::vector &data) +{ + auto nr = data[0].rows(), nc = data[0].cols(); + Eigen::MatrixXd sqSum(nr, nc); + double n = static_cast(data.size()); + + assert(n > 1.); + mean = Mat::Zero(nr, nc); + sqSum = Mat::Zero(nr, nc); + for (auto &d: data) + { + mean += d; + sqSum += d.cwiseProduct(d); + } + stdDev = ((sqSum - mean.cwiseProduct(mean)/n)/(n - 1.)).cwiseSqrt(); + mean /= n; +} + +#define grid_printf(...) 
\ +{\ + char _buf[1024];\ + sprintf(_buf, __VA_ARGS__);\ + MSG << _buf;\ +} + +enum {sRead = 0, sWrite = 1, gRead = 2, gWrite = 3}; + int main (int argc, char ** argv) { -#ifdef HAVE_LIME Grid_init(&argc,&argv); - int64_t threads = GridThread::GetThreads(); + int64_t threads = GridThread::GetThreads(); + auto mpi = GridDefaultMpi(); + unsigned int nVol = (BENCH_IO_LMAX - BENCH_IO_LMIN)/2 + 1; + unsigned int nRelVol = (BENCH_IO_LMAX - 24)/2 + 1; + std::vector perf(BENCH_IO_NPASS, Eigen::MatrixXd::Zero(nVol, 4)); + std::vector avPerf(BENCH_IO_NPASS, Eigen::VectorXd::Zero(4)); + std::vector latt; + MSG << "Grid is setup to use " << threads << " threads" << std::endl; - MSG << SEP << std::endl; - MSG << "Benchmark Lime write" << std::endl; - MSG << SEP << std::endl; - for (int l = 4; l <= BENCH_IO_LMAX; l += 2) + MSG << "MPI partition " << mpi << std::endl; + for (unsigned int i = 0; i < BENCH_IO_NPASS; ++i) { - auto mpi = GridDefaultMpi(); - std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + MSG << BIGSEP << std::endl; + MSG << "Pass " << i + 1 << "/" << BENCH_IO_NPASS << std::endl; + MSG << BIGSEP << std::endl; + MSG << SEP << std::endl; + MSG << "Benchmark std write" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; - std::cout << "-- Local volume " << l << "^4" << std::endl; - writeBenchmark(latt, filestem(l), limeWrite); + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), stdWrite); + perf[i](volInd(l), sWrite) = BinaryIO::lastPerf.mbytesPerSecond; + } + + MSG << SEP << std::endl; + MSG << "Benchmark std read" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), stdRead); + perf[i](volInd(l), sRead) = 
BinaryIO::lastPerf.mbytesPerSecond; + } + + #ifdef HAVE_LIME + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime write" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + writeBenchmark(latt, filestem(l), limeWrite); + perf[i](volInd(l), gWrite) = BinaryIO::lastPerf.mbytesPerSecond; + } + + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime read" << std::endl; + MSG << SEP << std::endl; + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + + MSG << "-- Local volume " << l << "^4" << std::endl; + readBenchmark(latt, filestem(l), limeRead); + perf[i](volInd(l), gRead) = BinaryIO::lastPerf.mbytesPerSecond; + } +#endif + avPerf[i].fill(0.); + for (int f = 0; f < 4; ++f) + for (int l = 24; l <= BENCH_IO_LMAX; l += 2) + { + avPerf[i](f) += perf[i](volInd(l), f); + } + avPerf[i] /= nRelVol; } - MSG << "Benchmark Lime read" << std::endl; - MSG << SEP << std::endl; - for (int l = 4; l <= BENCH_IO_LMAX; l += 2) - { - auto mpi = GridDefaultMpi(); - std::vector latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]}; + Eigen::MatrixXd mean(nVol, 4), stdDev(nVol, 4), rob(nVol, 4); + Eigen::VectorXd avMean(4), avStdDev(4), avRob(4); + double n = BENCH_IO_NPASS; - std::cout << "-- Local volume " << l << "^4" << std::endl; - readBenchmark(latt, filestem(l), limeRead); + stats(mean, stdDev, perf); + stats(avMean, avStdDev, avPerf); + rob.fill(100.); + rob -= 100.*stdDev.cwiseQuotient(mean.cwiseAbs()); + avRob.fill(100.); + avRob -= 100.*avStdDev.cwiseQuotient(avMean.cwiseAbs()); + + MSG << BIGSEP << std::endl; + MSG << "SUMMARY" << std::endl; + MSG << BIGSEP << std::endl; + MSG << "Summary of individual results (all results in MB/s)." << std::endl; + MSG << "Every second colum gives the standard deviation of the previous column." 
<< std::endl; + MSG << std::endl; + grid_printf("%4s %12s %12s %12s %12s %12s %12s %12s %12s\n", + "L", "std read", "std dev", "std write", "std dev", + "Grid read", "std dev", "Grid write", "std dev"); + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", + l, mean(volInd(l), sRead), stdDev(volInd(l), sRead), + mean(volInd(l), sWrite), stdDev(volInd(l), sWrite), + mean(volInd(l), gRead), stdDev(volInd(l), gRead), + mean(volInd(l), gWrite), stdDev(volInd(l), gWrite)); } + MSG << std::endl; + MSG << "Robustness of individual results, in \%. (rob = 100\% - std dev / mean)" << std::endl; + MSG << std::endl; + grid_printf("%4s %12s %12s %12s %12s\n", + "L", "std read", "std write", "Grid read", "Grid write"); + for (int l = BENCH_IO_LMIN; l <= BENCH_IO_LMAX; l += 2) + { + grid_printf("%4d %12.1f %12.1f %12.1f %12.1f\n", + l, rob(volInd(l), sRead), rob(volInd(l), sWrite), + rob(volInd(l), gRead), rob(volInd(l), gWrite)); + } + MSG << std::endl; + MSG << "Summary of results averaged over local volumes 24^4-" << BENCH_IO_LMAX << "^4 (all results in MB/s)." << std::endl; + MSG << "Every second colum gives the standard deviation of the previous column." << std::endl; + MSG << std::endl; + grid_printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", + "std read", "std dev", "std write", "std dev", + "Grid read", "std dev", "Grid write", "std dev"); + grid_printf("%12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f %12.1f\n", + avMean(sRead), avStdDev(sRead), avMean(sWrite), avStdDev(sWrite), + avMean(gRead), avStdDev(gRead), avMean(gWrite), avStdDev(gWrite)); + MSG << std::endl; + MSG << "Robustness of volume-averaged results, in \%. 
(rob = 100\% - std dev / mean)" << std::endl; + MSG << std::endl; + grid_printf("%12s %12s %12s %12s\n", + "std read", "std write", "Grid read", "Grid write"); + grid_printf("%12.1f %12.1f %12.1f %12.1f\n", + avRob(sRead), avRob(sWrite), avRob(gRead), avRob(gWrite)); Grid_finalize(); -#endif + return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/benchmarks/Benchmark_IO.hpp b/benchmarks/Benchmark_IO.hpp index d3416353..2ff42d52 100644 --- a/benchmarks/Benchmark_IO.hpp +++ b/benchmarks/Benchmark_IO.hpp @@ -2,10 +2,12 @@ #define Benchmark_IO_hpp_ #include -#ifdef HAVE_LIME #define MSG std::cout << GridLogMessage #define SEP \ +"-----------------------------------------------------------------------------" +#define BIGSEP \ "=============================================================================" +#ifdef HAVE_LIME namespace Grid { @@ -14,13 +16,152 @@ using WriterFn = std::function ; template using ReaderFn = std::function; +// AP 06/10/2020: Standard C version in case one is suspicious of the C++ API +// +// template +// void stdWrite(const std::string filestem, Field &vec) +// { +// std::string rankStr = std::to_string(vec.Grid()->ThisRank()); +// std::FILE *file = std::fopen((filestem + "." 
+ rankStr + ".bin").c_str(), "wb"); +// size_t size; +// uint32_t crc; +// GridStopWatch ioWatch, crcWatch; + +// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); +// autoView(vec_v, vec, CpuRead); +// crcWatch.Start(); +// crc = GridChecksum::crc32(vec_v.cpu_ptr, size); +// std::fwrite(&crc, sizeof(uint32_t), 1, file); +// crcWatch.Stop(); +// MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; +// ioWatch.Start(); +// std::fwrite(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); +// ioWatch.Stop(); +// std::fclose(file); +// size *= vec.Grid()->ProcessorCount(); +// auto &p = BinaryIO::lastPerf; +// p.size = size; +// p.time = ioWatch.useconds(); +// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); +// MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() +// << ", " << p.mbytesPerSecond << " MB/s" << std::endl; +// MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; +// } +// +// template +// void stdRead(Field &vec, const std::string filestem) +// { +// std::string rankStr = std::to_string(vec.Grid()->ThisRank()); +// std::FILE *file = std::fopen((filestem + "." 
+ rankStr + ".bin").c_str(), "rb"); +// size_t size; +// uint32_t crcRead, crcData; +// GridStopWatch ioWatch, crcWatch; + +// size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); +// crcWatch.Start(); +// std::fread(&crcRead, sizeof(uint32_t), 1, file); +// crcWatch.Stop(); +// { +// autoView(vec_v, vec, CpuWrite); +// ioWatch.Start(); +// std::fread(vec_v.cpu_ptr, sizeof(typename Field::scalar_object), vec.Grid()->lSites(), file); +// ioWatch.Stop(); +// std::fclose(file); +// } +// { +// autoView(vec_v, vec, CpuRead); +// crcWatch.Start(); +// crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); +// crcWatch.Stop(); +// } +// MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; +// assert(crcData == crcRead); +// size *= vec.Grid()->ProcessorCount(); +// auto &p = BinaryIO::lastPerf; +// p.size = size; +// p.time = ioWatch.useconds(); +// p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); +// MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() +// << ", " << p.mbytesPerSecond << " MB/s" << std::endl; +// MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; +// } + +template +void stdWrite(const std::string filestem, Field &vec) +{ + std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + std::ofstream file(filestem + "." + rankStr + ".bin", std::ios::out | std::ios::binary); + size_t size, sizec; + uint32_t crc; + GridStopWatch ioWatch, crcWatch; + + size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + sizec = size/sizeof(char); // just in case of... 
+ autoView(vec_v, vec, CpuRead); + crcWatch.Start(); + crc = GridChecksum::crc32(vec_v.cpu_ptr, size); + file.write(reinterpret_cast(&crc), sizeof(uint32_t)/sizeof(char)); + crcWatch.Stop(); + MSG << "Std I/O write: Data CRC32 " << std::hex << crc << std::dec << std::endl; + ioWatch.Start(); + file.write(reinterpret_cast(vec_v.cpu_ptr), sizec); + file.flush(); + ioWatch.Stop(); + size *= vec.Grid()->ProcessorCount(); + auto &p = BinaryIO::lastPerf; + p.size = size; + p.time = ioWatch.useconds(); + p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + MSG << "Std I/O write: Wrote " << p.size << " bytes in " << ioWatch.Elapsed() + << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O write: checksum overhead " << crcWatch.Elapsed() << std::endl; +} + +template +void stdRead(Field &vec, const std::string filestem) +{ + std::string rankStr = std::to_string(vec.Grid()->ThisRank()); + std::ifstream file(filestem + "." + rankStr + ".bin", std::ios::in | std::ios::binary); + size_t size, sizec; + uint32_t crcRead, crcData; + GridStopWatch ioWatch, crcWatch; + + size = vec.Grid()->lSites()*sizeof(typename Field::scalar_object); + sizec = size/sizeof(char); // just in case of... 
+ crcWatch.Start(); + file.read(reinterpret_cast(&crcRead), sizeof(uint32_t)/sizeof(char)); + crcWatch.Stop(); + { + autoView(vec_v, vec, CpuWrite); + ioWatch.Start(); + file.read(reinterpret_cast(vec_v.cpu_ptr), sizec); + ioWatch.Stop(); + } + { + autoView(vec_v, vec, CpuRead); + crcWatch.Start(); + crcData = GridChecksum::crc32(vec_v.cpu_ptr, size); + crcWatch.Stop(); + } + MSG << "Std I/O read: Data CRC32 " << std::hex << crcData << std::dec << std::endl; + assert(crcData == crcRead); + size *= vec.Grid()->ProcessorCount(); + auto &p = BinaryIO::lastPerf; + p.size = size; + p.time = ioWatch.useconds(); + p.mbytesPerSecond = size/1024./1024./(ioWatch.useconds()/1.e6); + MSG << "Std I/O read: Read " << p.size << " bytes in " << ioWatch.Elapsed() + << ", " << p.mbytesPerSecond << " MB/s" << std::endl; + MSG << "Std I/O read: checksum overhead " << crcWatch.Elapsed() << std::endl; +} + template void limeWrite(const std::string filestem, Field &vec) { emptyUserRecord record; ScidacWriter binWriter(vec.Grid()->IsBoss()); - binWriter.open(filestem + ".bin"); + binWriter.open(filestem + ".lime.bin"); binWriter.writeScidacFieldRecord(vec, record); binWriter.close(); } @@ -31,7 +172,7 @@ void limeRead(Field &vec, const std::string filestem) emptyUserRecord record; ScidacReader binReader; - binReader.open(filestem + ".bin"); + binReader.open(filestem + ".lime.bin"); binReader.readScidacFieldRecord(vec, record); binReader.close(); } @@ -73,12 +214,18 @@ void writeBenchmark(const Coordinate &latt, const std::string filename, auto simd = GridDefaultSimd(latt.size(), Field::vector_type::Nsimd()); std::shared_ptr gBasePt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi)); std::shared_ptr gPt; + std::random_device rd; makeGrid(gPt, gBasePt, Ls, rb); - GridBase *g = gPt.get(); - GridParallelRNG rng(g); - Field vec(g); + GridBase *g = gPt.get(); + GridParallelRNG rng(g); + Field vec(g); + + rng.SeedFixedIntegers({static_cast(rd()), static_cast(rd()), + static_cast(rd()), 
static_cast(rd()), + static_cast(rd()), static_cast(rd()), + static_cast(rd()), static_cast(rd())}); random(rng, vec); write(filename, vec); @@ -96,8 +243,8 @@ void readBenchmark(const Coordinate &latt, const std::string filename, makeGrid(gPt, gBasePt, Ls, rb); - GridBase *g = gPt.get(); - Field vec(g); + GridBase *g = gPt.get(); + Field vec(g); read(vec, filename); } diff --git a/benchmarks/Benchmark_IO_vs_dir.cc b/benchmarks/Benchmark_IO_vs_dir.cc index 6e6c9ae0..8252547b 100644 --- a/benchmarks/Benchmark_IO_vs_dir.cc +++ b/benchmarks/Benchmark_IO_vs_dir.cc @@ -1,14 +1,9 @@ #include "Benchmark_IO.hpp" - -#define MSG std::cout << GridLogMessage -#define SEP \ -"=============================================================================" - +#ifdef HAVE_LIME using namespace Grid; int main (int argc, char ** argv) { -#ifdef HAVE_LIME std::vector dir; unsigned int Ls; bool rb; @@ -34,46 +29,74 @@ int main (int argc, char ** argv) } Grid_init(&argc,&argv); - int64_t threads = GridThread::GetThreads(); + auto mpi = GridDefaultMpi(); + MSG << "Grid is setup to use " << threads << " threads" << std::endl; - MSG << SEP << std::endl; - MSG << "Benchmark double precision Lime write" << std::endl; - MSG << SEP << std::endl; - for (auto &d: dir) - { - MSG << "-- Directory " << d << std::endl; - writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); - } + MSG << "MPI partition " << mpi << std::endl; MSG << SEP << std::endl; - MSG << "Benchmark double precision Lime read" << std::endl; + MSG << "Benchmark Grid std write" << std::endl; MSG << SEP << std::endl; for (auto &d: dir) { MSG << "-- Directory " << d << std::endl; - readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); + writeBenchmark(GridDefaultLatt(), d + "/ioBench", + stdWrite, Ls, rb); + } + MSG << SEP << std::endl; + MSG << "Benchmark Grid std read" << std::endl; + MSG << SEP << std::endl; + for (auto &d: dir) + { + MSG << "-- Directory " << d << std::endl; + 
readBenchmark(GridDefaultLatt(), d + "/ioBench", + stdRead, Ls, rb); } +#ifdef HAVE_LIME MSG << SEP << std::endl; - MSG << "Benchmark single precision Lime write" << std::endl; + MSG << "Benchmark Grid C-Lime write" << std::endl; MSG << SEP << std::endl; for (auto &d: dir) { MSG << "-- Directory " << d << std::endl; - writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); + writeBenchmark(GridDefaultLatt(), d + "/ioBench", + limeWrite, Ls, rb); } + MSG << SEP << std::endl; + MSG << "Benchmark Grid C-Lime read" << std::endl; + MSG << SEP << std::endl; + for (auto &d: dir) + { + MSG << "-- Directory " << d << std::endl; + readBenchmark(GridDefaultLatt(), d + "/ioBench", + limeRead, Ls, rb); + } +#endif - MSG << SEP << std::endl; - MSG << "Benchmark single precision Lime read" << std::endl; - MSG << SEP << std::endl; - for (auto &d: dir) - { - MSG << "-- Directory " << d << std::endl; - readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); - } + // MSG << SEP << std::endl; + // MSG << "Benchmark single precision Lime write" << std::endl; + // MSG << SEP << std::endl; + // for (auto &d: dir) + // { + // MSG << "-- Directory " << d << std::endl; + // writeBenchmark(GridDefaultLatt(), d + "/ioBench", limeWrite, Ls, rb); + // } + + // MSG << SEP << std::endl; + // MSG << "Benchmark single precision Lime read" << std::endl; + // MSG << SEP << std::endl; + // for (auto &d: dir) + // { + // MSG << "-- Directory " << d << std::endl; + // readBenchmark(GridDefaultLatt(), d + "/ioBench", limeRead, Ls, rb); + // } Grid_finalize(); -#endif + return EXIT_SUCCESS; } +#else +int main(int argc,char ** argv){} +#endif diff --git a/benchmarks/Benchmark_ITT.cc b/benchmarks/Benchmark_ITT.cc index dc09549c..81d1acd4 100644 --- a/benchmarks/Benchmark_ITT.cc +++ b/benchmarks/Benchmark_ITT.cc @@ -1,4 +1,4 @@ - /************************************************************************************* 
+/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -62,7 +62,7 @@ struct time_statistics{ void comms_header(){ std::cout < xbuf(8); std::vector rbuf(8); - Grid.ShmBufferFreeAll(); + //Grid.ShmBufferFreeAll(); + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); for(int d=0;d<8;d++){ - xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); - rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); - bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); - bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + // bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); + // bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); } - int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); int ncomm; double dbytes; - std::vector times(Nloop); - for(int i=0;i1 ) { - dbytes=0; - ncomm=0; + std::vector times(Nloop); + for(int i=0;i1 ) { - + dbytes=0; + double start=usecond(); int xmit_to_rank; int recv_from_rank; + if ( dir == mu ) { int comm_proc=1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); @@ -168,40 +164,40 @@ public: int comm_proc = mpi_layout[mu]-1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); } - tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, - (void *)&rbuf[dir][0], recv_from_rank, - bytes,dir); - thread_critical { - ncomm++; - dbytes+=tbytes; - } + Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, + (void *)&rbuf[dir][0], recv_from_rank, + bytes); + dbytes+=bytes; + + double stop=usecond(); + t_time[i] = stop-start; // microseconds + } - }); - Grid.Barrier(); - double stop=usecond(); - t_time[i] = stop-start; // 
microseconds + timestat.statistics(t_time); + + dbytes=dbytes*ppn; + double xbytes = dbytes*0.5; + double bidibytes = dbytes; + + std::cout<({45,12,81,9})); for(int lat=8;lat<=lmax;lat+=8){ @@ -247,11 +243,6 @@ public: double start=usecond(); for(int i=0;i > LatticeSU4; + + Coordinate simd_layout = GridDefaultSimd(Nd,vComplexF::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + + std::cout<({45,12,81,9})); + for(int lat=8;lat<=lmax;lat+=8){ + + Coordinate latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); + int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + + NN =Grid.NodeCount(); + + + LatticeSU4 z(&Grid); z=Zero(); + LatticeSU4 x(&Grid); x=Zero(); + LatticeSU4 y(&Grid); y=Zero(); + double a=2.0; + + uint64_t Nloop=NLOOP; + + double start=usecond(); + for(int i=0;iRankCount(); @@ -291,11 +337,11 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout<::HotConfiguration(RNG4,Umu); Fermion src (FGrid); random(RNG5,src); Fermion src_e (FrbGrid); Fermion src_o (FrbGrid); @@ -369,7 +415,7 @@ public: } FGrid->Barrier(); double t1=usecond(); - uint64_t ncall = 50; + uint64_t ncall = 500; FGrid->Broadcast(0,&ncall,sizeof(ncall)); @@ -387,7 +433,17 @@ public: FGrid->Barrier(); double volume=Ls; for(int mu=0;mumflops_best ) mflops_best = mflops; if ( mflopsRankCount(); @@ -447,7 +505,6 @@ public: NN_global=NN; uint64_t SHM=NP/NN; - Coordinate latt4({local[0]*mpi[0],local[1]*mpi[1],local[2]*mpi[2],local[3]*mpi[3]}); ///////// Welcome message //////////// std::cout<::HotConfiguration(RNG4,Umu); typename Action::ImplParams params; Action Ds(Umu,Umu,*FGrid,*FrbGrid,mass,c1,c2,u0,params); @@ -596,11 +653,12 @@ int main (int argc, char ** argv) #endif Benchmark::Decomposition(); + int do_su4=1; int do_memory=1; int do_comms =1; - int sel=2; - 
std::vector L_list({16,24,32}); + int sel=4; + std::vector L_list({8,12,16,24,32}); int selm1=sel-1; std::vector wilson; @@ -624,7 +682,6 @@ int main (int argc, char ** argv) dwf4.push_back(result); } - /* std::cout<1) ) { + if ( do_su4 ) { + std::cout< > xbuf(8); - std::vector > rbuf(8); + std::vector > xbuf(8); + std::vector > rbuf(8); for(int mu=0;mu<8;mu++){ xbuf[mu].resize(lat*lat*lat*Ls); diff --git a/benchmarks/Benchmark_comms_host_device.cc b/benchmarks/Benchmark_comms_host_device.cc new file mode 100644 index 00000000..591b5597 --- /dev/null +++ b/benchmarks/Benchmark_comms_host_device.cc @@ -0,0 +1,260 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./benchmarks/Benchmark_comms.cc + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +struct time_statistics{ + double mean; + double err; + double min; + double max; + + void statistics(std::vector v){ + double sum = std::accumulate(v.begin(), v.end(), 0.0); + mean = sum / v.size(); + + std::vector diff(v.size()); + std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; }); + double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); + err = std::sqrt(sq_sum / (v.size()*(v.size() - 1))); + + auto result = std::minmax_element(v.begin(), v.end()); + min = *result.first; + max = *result.second; +} +}; + +void header(){ + std::cout <1) nmu++; + + std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; + std::vector t_time(Nloop); + time_statistics timestat; + + std::cout< > xbuf(8); + std::vector > rbuf(8); + + for(int mu=0;mu<8;mu++){ + xbuf[mu].resize(lat*lat*lat*Ls); + rbuf[mu].resize(lat*lat*lat*Ls); + } + uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + 
std::cout< xbuf(8); + std::vector rbuf(8); + + uint64_t bytes = lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); + for(int d=0;d<8;d++){ + xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes); + } + + int ncomm; + + for(int mu=0;mu<4;mu++){ + if (mpi_layout[mu]>1 ) { + double start=usecond(); + for(int i=0;i requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu][0], + xmit_to_rank, + (void *)&rbuf[mu][0], + recv_from_rank, + bytes); + } + + comm_proc = mpi_layout[mu]-1; + { + std::vector requests; + Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); + Grid.SendToRecvFrom((void *)&xbuf[mu+4][0], + xmit_to_rank, + (void *)&rbuf[mu+4][0], + recv_from_rank, + bytes); + } + } + Grid.Barrier(); + double stop=usecond(); + double mean=(stop-start)/Nloop; + double dbytes = bytes*ppn; + double xbytes = dbytes*2.0*ncomm; + double rbytes = xbytes; + double bidibytes = xbytes+rbytes; + + std::cout<::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; #if 0 Umu=1.0; diff --git a/benchmarks/Benchmark_dwf_fp32.cc b/benchmarks/Benchmark_dwf_fp32.cc new file mode 100644 index 00000000..03f3ee61 --- /dev/null +++ b/benchmarks/Benchmark_dwf_fp32.cc @@ -0,0 +1,364 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + + int threads = GridThread::GetThreads(); + + Coordinate latt4 = GridDefaultLatt(); + int Ls=16; + for(int i=0;i> Ls; + } + + GridLogLayout(); + + long unsigned int single_site_flops = 8*Nc*(7+16*Nc); + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi()); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); 
RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); +#if 0 + src = Zero(); + { + Coordinate origin({0,0,0,latt4[2]-1,0}); + SpinColourVectorF tmp; + tmp=Zero(); + tmp()(0)(0)=Complex(-2.0,0.0); + std::cout << " source site 0 " << tmp<::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; +#if 0 + Umu=1.0; + for(int mu=0;mu(Umu,mu); + // if (mu !=2 ) ttmp = 0; + // ttmp = ttmp* pow(10.0,mu); + PokeIndex(Umu,ttmp,mu); + } + std::cout << GridLogMessage << "Forced to diagonal " << std::endl; +#endif + + //////////////////////////////////// + // Naive wilson implementation + //////////////////////////////////// + // replicate across fifth dimension + LatticeGaugeFieldF Umu5d(FGrid); + std::vector U(4,FGrid); + { + autoView( Umu5d_v, Umu5d, CpuWrite); + autoView( Umu_v , Umu , CpuRead); + for(int ss=0;ssoSites();ss++){ + for(int s=0;s(Umu5d,mu); + } + std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; + + if (1) + { + ref = Zero(); + for(int mu=0;mu_Nprocessors; + RealD NN = UGrid->NodeCount(); + + std::cout << GridLogMessage<< "*****************************************************************" <Barrier(); + Dw.ZeroCounters(); + Dw.Dhop(src,result,0); + std::cout<Barrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4) ) { + /* + std::cout << "RESULT\n " << result<Barrier(); + exit(-1); + } + assert (norm2(err)< 1.0e-4 ); + Dw.Report(); + } + + if (1) + { // Naive wilson dag implementation + ref = Zero(); + for(int mu=0;mu1.0e-4)){ +/* + std::cout<< "DAG RESULT\n " <Barrier(); + Dw.DhopEO(src_o,r_e,DaggerNo); + double t0=usecond(); + for(int i=0;iBarrier(); + + double volume=Ls; for(int mu=0;mu1.0e-4)){ + /* + std::cout<< "Deo RESULT\n " <::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "Random gauge initialised " << std::endl; RealD mass=0.1; @@ -184,7 +184,7 @@ int 
main (int argc, char ** argv) std::cout<::HotConfiguration(RNG4,Umu); std::cout << GridLogMessage << "made random gauge fields"< -where:: - - --enable-precision=single|double - -sets the **default precision**. Since this is largely a benchmarking convenience, it is anticipated that the default precision may be removed in future implementations, -and that explicit type selection be made at all points. Naturally, most code will be type templated in any case.:: +:: --enable-simd=GEN|SSE4|AVX|AVXFMA|AVXFMA4|AVX2|AVX512|NEONv8|QPX @@ -236,7 +231,7 @@ Detailed build configuration options --enable-mkl[=path] use Intel MKL for FFT (and LAPACK if enabled) routines. A UNIX prefix containing the library can be specified (optional). --enable-simd=code setup Grid for the SIMD target ``(default: `GEN`). A list of possible SIMD targets is detailed in a section below. --enable-gen-simd-width=size select the size (in bytes) of the generic SIMD vector type (default: 32 bytes). E.g. SSE 128 bit corresponds to 16 bytes. - --enable-precision=single|double set the default precision (default: `double`). + --enable-precision=single|double set the default precision (default: `double`). **Deprecated option** --enable-comms=mpi|none use `` for message passing (default: `none`). --enable-rng=sitmo|ranlux48|mt19937 choose the RNG (default: `sitmo`). --disable-timers disable system dependent high-resolution timers. @@ -304,8 +299,7 @@ Build setup for Intel Knights Landing platform The following configuration is recommended for the Intel Knights Landing platform:: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -314,8 +308,7 @@ The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library. 
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -332,8 +325,7 @@ presently performs better with use of more than one rank per node, using shared for interior communication. We recommend four ranks per node for best performance, but optimum is local volume dependent. :: - ../configure --enable-precision=double\ - --enable-simd=KNL \ + ../configure --enable-simd=KNL \ --enable-comms=mpi-auto \ --enable-mkl \ CC=icpc MPICXX=mpiicpc @@ -343,8 +335,7 @@ Build setup for Intel Haswell Xeon platform The following configuration is recommended for the Intel Haswell platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi-auto \ --enable-mkl \ CXX=icpc MPICXX=mpiicpc @@ -360,8 +351,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -379,8 +369,7 @@ Build setup for Intel Skylake Xeon platform The following configuration is recommended for the Intel Skylake platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX512 \ + ../configure --enable-simd=AVX512 \ --enable-comms=mpi \ --enable-mkl \ CXX=mpiicpc @@ -396,8 +385,7 @@ where `` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:: - ../configure --enable-precision=double\ - --enable-simd=AVX512 \ + ../configure --enable-simd=AVX512 \ --enable-comms=mpi \ --enable-mkl \ CXX=CC CC=cc @@ -422,8 +410,7 @@ and 8 threads per rank. 
The following configuration is recommended for the AMD EPYC platform:: - ../configure --enable-precision=double\ - --enable-simd=AVX2 \ + ../configure --enable-simd=AVX2 \ --enable-comms=mpi \ CXX=mpicxx diff --git a/examples/Example_Laplacian.cc b/examples/Example_Laplacian.cc new file mode 100644 index 00000000..fa8466cf --- /dev/null +++ b/examples/Example_Laplacian.cc @@ -0,0 +1,396 @@ +#include +using namespace Grid; + +/* +///////////////////////////////////////////////////////////////////////////////////////////// +// Grid/algorithms/SparseMatrix.h: Interface defining what I expect of a general sparse matrix, such as a Fermion action +///////////////////////////////////////////////////////////////////////////////////////////// +template class SparseMatrixBase { +public: + virtual GridBase *Grid(void) =0; + + virtual void M (const Field &in, Field &out)=0; + virtual void Mdag (const Field &in, Field &out)=0; + virtual void MdagM(const Field &in, Field &out) { + Field tmp (in.Grid()); + M(in,tmp); + Mdag(tmp,out); + } + virtual void Mdiag (const Field &in, Field &out)=0; + virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; + virtual void MdirAll (const Field &in, std::vector &out)=0; +}; +*/ + +const std::vector directions ({Xdir,Ydir,Zdir,Xdir,Ydir,Zdir}); +const std::vector displacements({1,1,1,-1,-1,-1}); + +template class FreeLaplacianCshift : public SparseMatrixBase +{ +public: + GridBase *grid; + FreeLaplacianCshift(GridBase *_grid) + { + grid=_grid; + }; + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out = Zero(); + for(int mu=0;mu &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual 
void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out + Gimpl::CovShiftForward(Umu,mu,in); + out = out + Gimpl::CovShiftBackward(Umu,mu,in); + out = out - 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + + +#define LEG_LOAD(Dir) \ + SE = st.GetEntry(ptype, Dir, ss); \ + if (SE->_is_local ) { \ + int perm= SE->_permute; \ + chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \ + } else { \ + chi = coalescedRead(buf[SE->_offset],lane); \ + } \ + acceleratorSynchronise(); + +template class FreeLaplacianStencil : public SparseMatrixBase +{ +public: + typedef typename Field::vector_object siteObject; + typedef CartesianStencil StencilImpl; + + GridBase *grid; + StencilImpl Stencil; + SimpleCompressor Compressor; + + FreeLaplacianStencil(GridBase *_grid) + : Stencil (_grid,6,Even,directions,displacements,0), grid(_grid) + { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &_in, Field &_out) + { + + /////////////////////////////////////////////// + // Halo exchange for this geometry of stencil + /////////////////////////////////////////////// + Stencil.HaloExchange(_in, Compressor); + + /////////////////////////////////// + // Arithmetic expressions + /////////////////////////////////// + + // Views; device friendly/accessible pointers + auto st = Stencil.View(AcceleratorRead); + auto buf = st.CommBuf(); + autoView( in , _in , AcceleratorRead); + autoView( out , _out , AcceleratorWrite); + + typedef typename Field::vector_object vobj; + typedef 
decltype(coalescedRead(in[0])) calcObj; + + const int Nsimd = vobj::Nsimd(); + const uint64_t NN = grid->oSites(); + + accelerator_for( ss, NN, Nsimd, { + + StencilEntry *SE; + + const int lane=acceleratorSIMTlane(Nsimd); + + calcObj chi; + calcObj res; + int ptype; + + res = coalescedRead(in[ss])*(-6.0); + LEG_LOAD(0); res = res + chi; + LEG_LOAD(1); res = res + chi; + LEG_LOAD(2); res = res + chi; + LEG_LOAD(3); res = res + chi; + LEG_LOAD(4); res = res + chi; + LEG_LOAD(5); res = res + chi; + + coalescedWrite(out[ss], res,lane); + + }); + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +template class CovariantLaplacianStencil : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Field::vector_object siteObject; + + template using iImplDoubledGaugeField = iVector >, Nds>; + typedef iImplDoubledGaugeField SiteDoubledGaugeField; + typedef Lattice DoubledGaugeField; + + typedef CartesianStencil StencilImpl; + + GridBase *grid; + StencilImpl Stencil; + SimpleCompressor Compressor; + DoubledGaugeField Uds; + CovariantLaplacianStencil(GaugeField &Umu) + : + grid(Umu.Grid()), + Stencil (grid,6,Even,directions,displacements,0), + Uds(grid) + { + for (int mu = 0; mu < Nd; mu++) { + auto U = PeekIndex(Umu, mu); + PokeIndex(Uds, U, mu ); + U = adj(Cshift(U, mu, -1)); + PokeIndex(Uds, U, mu + 4); + } + }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &_in, Field &_out) + { + /////////////////////////////////////////////// + // Halo exchange for this geometry of stencil + 
/////////////////////////////////////////////// + Stencil.HaloExchange(_in, Compressor); + + /////////////////////////////////// + // Arithmetic expressions + /////////////////////////////////// + auto st = Stencil.View(AcceleratorRead); + auto buf = st.CommBuf(); + + autoView( in , _in , AcceleratorRead); + autoView( out , _out , AcceleratorWrite); + autoView( U , Uds , AcceleratorRead); + + typedef typename Field::vector_object vobj; + typedef decltype(coalescedRead(in[0])) calcObj; + typedef decltype(coalescedRead(U[0](0))) calcLink; + + const int Nsimd = vobj::Nsimd(); + const uint64_t NN = grid->oSites(); + + accelerator_for( ss, NN, Nsimd, { + + StencilEntry *SE; + + const int lane=acceleratorSIMTlane(Nsimd); + + calcObj chi; + calcObj res; + calcObj Uchi; + calcLink UU; + int ptype; + + res = coalescedRead(in[ss])*(-6.0); + +#define LEG_LOAD_MULT(leg,polarisation) \ + UU = coalescedRead(U[ss](polarisation)); \ + LEG_LOAD(leg); \ + mult(&Uchi(), &UU, &chi()); \ + res = res + Uchi; + + LEG_LOAD_MULT(0,Xp); + LEG_LOAD_MULT(1,Yp); + LEG_LOAD_MULT(2,Zp); + LEG_LOAD_MULT(3,Xm); + LEG_LOAD_MULT(4,Ym); + LEG_LOAD_MULT(5,Zm); + + coalescedWrite(out[ss], res,lane); + }); + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +#undef LEG_LOAD_MULT +#undef LEG_LOAD + +int main(int argc, char ** argv) +{ + Grid_init(&argc, &argv); + + typedef LatticeColourVector Field; + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridParallelRNG 
RNG(&Grid); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + FreeLaplacianCshift FLcs(&Grid); + FreeLaplacianStencil FLst(&Grid); + + LatticeGaugeField U(&Grid); + + SU::ColdConfiguration(RNG,U); + + std::cout << " Gauge field has norm " < CLcs(U); + CovariantLaplacianStencil CLst(U); + + Field in(&Grid); gaussian(RNG,in); + Field out_FLcs(&Grid); + Field out_FLst(&Grid); + Field out_CLcs(&Grid); + Field out_CLst(&Grid); + Field diff(&Grid); + + //////////////////////////////////////////////////////// + // First test: in free field these should all agree + //////////////////////////////////////////////////////// + FLcs.M(in,out_FLcs); + FLst.M(in,out_FLst); + CLcs.M(in,out_CLcs); + CLst.M(in,out_CLst); + + std:: cout << "******************************************************************" <::RandomGaugeTransform(RNG,U_GT,g); // Unit gauge + + Field in_GT(&Grid); + Field out_GT(&Grid); + + Field out_CLcs_GT(&Grid); + Field out_CLst_GT(&Grid); + + CovariantLaplacianCshift CLcs_GT(U_GT); + CovariantLaplacianStencil CLst_GT(U_GT); + + in_GT = g*in; + out_GT = g*out_FLcs; + + // Check M^GT_xy in_GT = g(x) M_xy g^dag(y) g(y) in = g(x) out(x) + CLcs_GT.M(in_GT,out_CLcs_GT); + CLst_GT.M(in_GT,out_CLst_GT); + + diff = out_CLcs_GT - out_GT; + std:: cout << " Difference between Gauge xformed result and covariant Cshift Laplacian in xformed gauge = " < dim_mask({1,1,1,0}); // 3d FFT + FFT theFFT(&Grid); + Field out(&Grid); + Field F_out(&Grid); + Field F_in(&Grid); + + // FFT the random input vector + theFFT.FFT_dim_mask(F_in,in,dim_mask,FFT::forward); + + // Convolution theorem: multiply by Fourier representation of (discrete) Laplacian to apply diff op + LatticeComplexD lap(&Grid); lap = Zero(); + LatticeComplexD kmu(&Grid); + ComplexD ci(0.0,1.0); + for(int mu=0;mu<3;mu++) { + + RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; + + LatticeCoordinate(kmu,mu); + kmu = TwoPiL * kmu; + + // (e^ik_mu + e^-ik_mu - 2) = 2( cos kmu - 1) ~ 2 (1 - k_mu^2/2 -1 ) = - k_mu^2 + O(k^4) + lap 
= lap + 2.0*cos(kmu) - 2.0; + + } + F_out = lap * F_in; + + // Inverse FFT the result + theFFT.FFT_dim_mask(out,F_out,dim_mask,FFT::backward); + + std::cout<<"Fourier xformed (in) "< +using namespace Grid; + +// Function used for Chebyshev smearing +// +Real MomentumSmearing(Real p2) +{ + return (1 - 4.0*p2) * exp(-p2/4); +} +Real DistillationSmearing(Real p2) +{ + if ( p2 > 0.5 ) return 0.0; + else return 1.0; +} + +// Flip sign to make prop to p^2, not -p^2 relative to last example +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + + + +int main(int argc, char ** argv) +{ + Grid_init(&argc, &argv); + + typedef LatticeColourVector Field; + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + + LatticeGaugeField U(&Grid); + + SU::ColdConfiguration(RNG,U); + + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + + ColourVector 
ColourKronecker; + ColourKronecker = Zero(); + ColourKronecker()()(0) = 1.0; + + Coordinate site({latt_size[0]/2, + latt_size[1]/2, + latt_size[2]/2, + 0}); + + Field kronecker(&Grid); + kronecker = Zero(); + pokeSite(ColourKronecker,kronecker,site); + + + Field psi(&Grid), chi(&Grid); + + ////////////////////////////////////// + // Classic Wuppertal smearing + ////////////////////////////////////// + + Integer Iterations = 80; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + chi=kronecker; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(chi,psi); + chi = chi - coeff*psi; + } + + std::cout << " Wuppertal smeared operator is chi = \n" << chi < HermOp(Laplacian); + + std::cout << " Checking spectral range of our POSITIVE definite operator \n"; + PowerMethod PM; + PM(HermOp,kronecker); + + // Chebyshev ChebySmear(lo,hi,20,DistillationSmearing); + Chebyshev ChebySmear(lo,hi,20,MomentumSmearing); + { + std::ofstream of("chebysmear"); + ChebySmear.csv(of); + } + + ChebySmear(HermOp,kronecker,chi); + + std::cout << " Chebyshev smeared operator is chi = \n" << chi < +using namespace Grid; + +template +void SimpleConjugateGradient(LinearOperatorBase &HPDop,const Field &b, Field &x) +{ + RealD cp, c, alpha, d, beta, ssq, qq; + RealD Tolerance=1.0e-10; + int MaxIterations=10000; + + Field p(b), mmp(b), r(b); + + HPDop.HermOpAndNorm(x, mmp, d, beta); + + r = b - mmp; + p = r; + + cp = alpha = norm2(p); + ssq = norm2(b); + + RealD rsq = Tolerance * Tolerance * ssq; + + for (int k = 1; k <= MaxIterations; k++) { + c = cp; + + HPDop.HermOp(p, mmp); + + d = real(innerProduct(p,mmp)); + + alpha = c / d; + + r = r - alpha *mmp; + cp = norm2(r); + beta = cp / c; + + x = x + alpha* p ; + p = r + beta* p ; + + std::cout << "iteration "< class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase *grid; + GaugeField U; + RealD m2=1.0e-2; + 
CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in + m2*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + + + +int main(int argc, char ** argv) +{ + Grid_init(&argc, &argv); + + typedef LatticeColourVector Field; + + auto latt_size = GridDefaultLatt(); + auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + auto mpi_layout = GridDefaultMpi(); + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector({45,12,81,9})); + + + LatticeGaugeField U(&Grid); + + SU::ColdConfiguration(RNG,U); + + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + + ColourVector ColourKronecker; + ColourKronecker = Zero(); + ColourKronecker()()(0) = 1.0; + + Coordinate site({0,0,0,0}); // Point source at origin + + Field kronecker(&Grid); + kronecker = Zero(); + pokeSite(ColourKronecker,kronecker,site); + + Field psi(&Grid); psi=Zero(); + + HermitianLinearOperator HermOp(Laplacian); + SimpleConjugateGradient(HermOp, kronecker,psi); + + Field r(&Grid); + Laplacian.M(psi,r); + r=kronecker-r; + + std::cout << "True residual "<< norm2(r) < + +using namespace std; +using namespace Grid; + +template class CovariantLaplacianCshift : public SparseMatrixBase +{ +public: + INHERIT_GIMPL_TYPES(Gimpl); + + GridBase 
*grid; + GaugeField U; + + CovariantLaplacianCshift(GaugeField &_U) : + grid(_U.Grid()), + U(_U) { }; + + virtual GridBase *Grid(void) { return grid; }; + + virtual void M (const Field &in, Field &out) + { + out=Zero(); + for(int mu=0;mu(U, mu); // NB: Inefficent + out = out - Gimpl::CovShiftForward(Umu,mu,in); + out = out - Gimpl::CovShiftBackward(Umu,mu,in); + out = out + 2.0*in; + } + }; + virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian + virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid + virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid + virtual void MdirAll (const Field &in, std::vector &out) {assert(0);}; // Unimplemented need only for multigrid +}; + +void MakePhase(Coordinate mom,LatticeComplex &phase) +{ + GridBase *grid = phase.Grid(); + auto latt_size = grid->GlobalDimensions(); + ComplexD ci(0.0,1.0); + phase=Zero(); + + LatticeComplex coor(phase.Grid()); + for(int mu=0;mu +void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared) +{ + typedef CovariantLaplacianCshift Laplacian_t; + Laplacian_t Laplacian(U); + + Integer Iterations = 40; + Real width = 2.0; + Real coeff = (width*width) / Real(4*Iterations); + + Field tmp(U.Grid()); + smeared=unsmeared; + // chi = (1-p^2/2N)^N kronecker + for(int n = 0; n < Iterations; ++n) { + Laplacian.M(smeared,tmp); + smeared = smeared - coeff*tmp; + std::cout << " smear iter " << n<<" " < +void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator) +{ + GridBase *UGrid = D.GaugeGrid(); + GridBase *FGrid = D.FermionGrid(); + + LatticeFermion src4 (UGrid); + LatticeFermion src5 (FGrid); + LatticeFermion result5(FGrid); + LatticeFermion result4(UGrid); + + ConjugateGradient CG(1.0e-8,100000); + SchurRedBlackDiagMooeeSolve schur(CG); + ZeroGuesser ZG; // Could be a DeflatedGuesser if have eigenvectors + for(int 
s=0;s(src4,source,s,c); + + D.ImportPhysicalFermionSource(src4,src5); + + result5=Zero(); + schur(D,src5,result5,ZG); + std::cout<(propagator,result4,s,c); + } + } +} + +class MesonFile: Serializable { +public: + GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector >, data); +}; + +void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase) +{ + const int nchannel=4; + Gamma::Algebra Gammas[nchannel][2] = { + {Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5}, + {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5}, + {Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5}, + {Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5} + }; + + Gamma G5(Gamma::Algebra::Gamma5); + + LatticeComplex meson_CF(q1.Grid()); + MesonFile MF; + + for(int ch=0;ch meson_T; + sliceSum(meson_CF,meson_T, Tdir); + + int nt=meson_T.size(); + + std::vector corr(nt); + for(int t=0;t seeds4({1,2,3,4}); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + + LatticeGaugeField Umu(UGrid); + std::string config; + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + // SU::HotConfiguration(RNG4,Umu); + config="HotConfig"; + } + + std::vector masses({ 0.03,0.04,0.45} ); // u/d, s, c ?? 
+ + int nmass = masses.size(); + + std::vector FermActs; + + std::cout< PointProps(nmass,UGrid); + std::vector GaussProps(nmass,UGrid); + std::vector Z2Props (nmass,UGrid); + + for(int m=0;m Make.inc echo >> Make.inc echo CCFILES=$CCFILES >> Make.inc - +echo ZWILS_FERMION_FILES=$ZWILS_FERMION_FILES >> Make.inc +echo WILS_FERMION_FILES=$WILS_FERMION_FILES >> Make.inc +echo STAG_FERMION_FILES=$STAG_FERMION_FILES >> Make.inc +echo GP_FERMION_FILES=$GP_FERMION_FILES >> Make.inc +echo ADJ_FERMION_FILES=$ADJ_FERMION_FILES >> Make.inc +echo TWOIND_FERMION_FILES=$TWOIND_FERMION_FILES >> Make.inc # tests Make.inc cd $home/tests @@ -26,11 +40,10 @@ for subdir in $dirs; do echo "tests-local: ${TESTLIST} " > Make.inc echo ${PREF}_PROGRAMS = ${TESTLIST} >> Make.inc echo >> Make.inc - HADLINK=`[ $subdir = './hadrons' ] && echo '-lHadrons '` for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=${HADLINK}-lGrid >> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done if [ $subdir != '.' ]; then @@ -49,7 +62,7 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a' >> Make.inc echo >> Make.inc done cd .. @@ -65,7 +78,22 @@ echo >> Make.inc for f in $TESTS; do BNAME=`basename $f .cc` echo ${BNAME}_SOURCES=$f >> Make.inc - echo ${BNAME}_LDADD=-lGrid>> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc + echo >> Make.inc +done +cd .. + +# examples Make.inc +cd $home/examples/ +echo> Make.inc +TESTS=`ls *.cc` +TESTLIST=`echo ${TESTS} | sed s/.cc//g ` +echo bin_PROGRAMS = ${TESTLIST} > Make.inc +echo >> Make.inc +for f in $TESTS; do + BNAME=`basename $f .cc` + echo ${BNAME}_SOURCES=$f >> Make.inc + echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc echo >> Make.inc done cd .. 
diff --git a/tests/IO/Test_ildg_io.cc b/tests/IO/Test_ildg_io.cc index 369acfd1..d6e21101 100644 --- a/tests/IO/Test_ildg_io.cc +++ b/tests/IO/Test_ildg_io.cc @@ -69,7 +69,7 @@ int main (int argc, char ** argv) std::vector U(4,&Fine); - SU3::HotConfiguration(pRNGa,Umu); + SU::HotConfiguration(pRNGa,Umu); FieldMetaData header; diff --git a/tests/IO/Test_nersc_io.cc b/tests/IO/Test_nersc_io.cc index f5413e3b..c15c320e 100644 --- a/tests/IO/Test_nersc_io.cc +++ b/tests/IO/Test_nersc_io.cc @@ -84,7 +84,7 @@ int main (int argc, char ** argv) std::vector U(4,&Fine); - SU3::HotConfiguration(pRNGa,Umu); + SU::HotConfiguration(pRNGa,Umu); FieldMetaData header; std::string file("./ckpoint_lat.4000"); diff --git a/tests/Test_cayley_even_odd_vec.cc b/tests/Test_cayley_even_odd_vec.cc index 0e71d910..c345efd9 100644 --- a/tests/Test_cayley_even_odd_vec.cc +++ b/tests/Test_cayley_even_odd_vec.cc @@ -80,7 +80,7 @@ int main (int argc, char ** argv) GridParallelRNG sRNG5(sFGrid); sRNG5.SeedFixedIntegers(seeds5); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); RealD mass=0.1; RealD M5 =1.8; diff --git a/tests/Test_compressed_lanczos_hot_start.cc b/tests/Test_compressed_lanczos_hot_start.cc index 8eb7a921..dc22cfca 100644 --- a/tests/Test_compressed_lanczos_hot_start.cc +++ b/tests/Test_compressed_lanczos_hot_start.cc @@ -202,7 +202,7 @@ int main (int argc, char ** argv) { std::vector seeds4({1,2,3,4}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); // FieldMetaData header; // NerscIO::readConfiguration(Umu,header,Params.config); diff --git a/tests/Test_dwf_mixedcg_prec.cc b/tests/Test_dwf_mixedcg_prec.cc index be881db9..da0b54cd 100644 --- a/tests/Test_dwf_mixedcg_prec.cc +++ b/tests/Test_dwf_mixedcg_prec.cc @@ -71,7 +71,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(UGrid); LatticeGaugeFieldF 
Umu_f(UGrid_f); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); precisionChange(Umu_f,Umu); diff --git a/tests/Test_dwf_mixedcg_prec_halfcomms.cc b/tests/Test_dwf_mixedcg_prec_halfcomms.cc index 4d94632c..8b0126dc 100644 --- a/tests/Test_dwf_mixedcg_prec_halfcomms.cc +++ b/tests/Test_dwf_mixedcg_prec_halfcomms.cc @@ -69,7 +69,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(UGrid); LatticeGaugeFieldF Umu_f(UGrid_f); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); precisionChange(Umu_f,Umu); diff --git a/tests/core/Test_cf_coarsen_support.cc b/tests/core/Test_cf_coarsen_support.cc index e787905e..ad0309b9 100644 --- a/tests/core/Test_cf_coarsen_support.cc +++ b/tests/core/Test_cf_coarsen_support.cc @@ -64,7 +64,7 @@ int main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); LatticeFermion err(FGrid); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); // std::vector U(4,UGrid); // for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); RealD mass=0.1; diff --git a/tests/core/Test_dwf_eofa_even_odd.cc b/tests/core/Test_dwf_eofa_even_odd.cc index 01fff9ea..64701069 100644 --- a/tests/core/Test_dwf_eofa_even_odd.cc +++ b/tests/core/Test_dwf_eofa_even_odd.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) LatticeFermion ref (FGrid); ref = Zero(); LatticeFermion tmp (FGrid); tmp = Zero(); LatticeFermion err (FGrid); err = Zero(); - LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu); + LatticeGaugeField Umu (UGrid); SU::HotConfiguration(RNG4, Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_dwf_even_odd.cc b/tests/core/Test_dwf_even_odd.cc index 6093ee8f..4918f02a 100644 --- a/tests/core/Test_dwf_even_odd.cc +++ b/tests/core/Test_dwf_even_odd.cc @@ -72,7 +72,7 @@ int 
main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); tmp=Zero(); LatticeFermion err(FGrid); tmp=Zero(); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index 2ba3752b..212b1a35 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -138,7 +138,7 @@ int main (int argc, char ** argv) LatticeGaugeFieldD Umu(&GRID); - SU3::ColdConfiguration(pRNG,Umu); // Unit gauge + SU::ColdConfiguration(pRNG,Umu); // Unit gauge // Umu=Zero(); //////////////////////////////////////////////////// // Wilson test diff --git a/tests/core/Test_fft_gfix.cc b/tests/core/Test_fft_gfix.cc index 228770a8..87dbc242 100644 --- a/tests/core/Test_fft_gfix.cc +++ b/tests/core/Test_fft_gfix.cc @@ -73,11 +73,11 @@ int main (int argc, char ** argv) LatticeColourMatrix xform2(&GRID); // Gauge xform LatticeColourMatrix xform3(&GRID); // Gauge xform - SU3::ColdConfiguration(pRNG,Umu); // Unit gauge + SU::ColdConfiguration(pRNG,Umu); // Unit gauge Uorg=Umu; Urnd=Umu; - SU3::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge + SU::RandomGaugeTransform(pRNG,Urnd,g); // Unit gauge Real plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::HotConfiguration(pRNG,Umu); // Unit gauge plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::HotConfiguration(pRNG,Umu); // Unit gauge plaq=WilsonLoops::avgPlaquette(Umu); std::cout << " Initial plaquette "<::HotConfiguration(RNG4_2f,Umu_2f); StandardFermionField src (FGrid_2f); StandardFermionField tmpsrc(FGrid_2f); diff --git a/tests/core/Test_gpwilson_even_odd.cc b/tests/core/Test_gpwilson_even_odd.cc index bf37f4d5..69ace859 100644 --- a/tests/core/Test_gpwilson_even_odd.cc +++ b/tests/core/Test_gpwilson_even_odd.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) FermionField 
ref(&Grid); ref=Zero(); FermionField tmp(&Grid); tmp=Zero(); FermionField err(&Grid); tmp=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); double volume=1; diff --git a/tests/core/Test_lie_generators.cc b/tests/core/Test_lie_generators.cc index 471cea25..e044378c 100644 --- a/tests/core/Test_lie_generators.cc +++ b/tests/core/Test_lie_generators.cc @@ -66,7 +66,7 @@ int main(int argc, char** argv) { std::cout << GridLogMessage << "*********************************************" << std::endl; - std::cout << GridLogMessage << "* Generators for SU(3)" << std::endl; + std::cout << GridLogMessage << "* Generators for SU(Nc" << std::endl; std::cout << GridLogMessage << "*********************************************" << std::endl; SU3::printGenerators(); @@ -114,8 +114,8 @@ int main(int argc, char** argv) { LatticeGaugeField U(grid), V(grid); - SU::HotConfiguration(gridRNG, U); - SU::HotConfiguration(gridRNG, V); + SU3::HotConfiguration(gridRNG, U); + SU3::HotConfiguration(gridRNG, V); // Adjoint representation // Test group structure @@ -123,8 +123,8 @@ int main(int argc, char** argv) { LatticeGaugeField UV(grid); UV = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu = peekLorentz(U,mu); - SU::LatticeMatrix Vmu = peekLorentz(V,mu); + SU3::LatticeMatrix Umu = peekLorentz(U,mu); + SU3::LatticeMatrix Vmu = peekLorentz(V,mu); pokeLorentz(UV,Umu*Vmu, mu); } @@ -151,16 +151,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_adj(grid); + SU3::LatticeAlgebraVector h_adj(grid); typename AdjointRep::LatticeMatrix Ar(grid); random(gridRNG,h_adj); h_adj = real(h_adj); SU_Adjoint::AdjointLieAlgebraMatrix(h_adj,Ar); // Re-extract h_adj - SU::LatticeAlgebraVector h_adj2(grid); + SU3::LatticeAlgebraVector h_adj2(grid); SU_Adjoint::projectOnAlgebra(h_adj2, 
Ar); - SU::LatticeAlgebraVector h_diff = h_adj - h_adj2; + SU3::LatticeAlgebraVector h_diff = h_adj - h_adj2; std::cout << GridLogMessage << "Projections structure check vector difference (Adjoint representation) : " << norm2(h_diff) << std::endl; // Exponentiate @@ -183,14 +183,14 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af(grid); - SU::FundamentalLieAlgebraMatrix(h_adj,Af); - SU::LatticeMatrix Ufund(grid); + SU3::LatticeMatrix Af(grid); + SU3::FundamentalLieAlgebraMatrix(h_adj,Af); + SU3::LatticeMatrix Ufund(grid); Ufund = expMat(Af, 1.0, 16); // Check unitarity - SU::LatticeMatrix uno_f(grid); + SU3::LatticeMatrix uno_f(grid); uno_f = 1.0; - SU::LatticeMatrix UnitCheck(grid); + SU3::LatticeMatrix UnitCheck(grid); UnitCheck = Ufund * adj(Ufund) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck) << std::endl; @@ -311,14 +311,14 @@ int main(int argc, char** argv) { // Test group structure // (U_f * V_f)_r = U_r * V_r LatticeGaugeField U2(grid), V2(grid); - SU::HotConfiguration(gridRNG, U2); - SU::HotConfiguration(gridRNG, V2); + SU3::HotConfiguration(gridRNG, U2); + SU3::HotConfiguration(gridRNG, V2); LatticeGaugeField UV2(grid); UV2 = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu2 = peekLorentz(U2,mu); - SU::LatticeMatrix Vmu2 = peekLorentz(V2,mu); + SU3::LatticeMatrix Umu2 = peekLorentz(U2,mu); + SU3::LatticeMatrix Vmu2 = peekLorentz(V2,mu); pokeLorentz(UV2,Umu2*Vmu2, mu); } @@ -345,16 +345,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_sym(grid); + SU3::LatticeAlgebraVector h_sym(grid); typename TwoIndexRep< Nc, Symmetric>::LatticeMatrix Ar_sym(grid); random(gridRNG,h_sym); h_sym = real(h_sym); SU_TwoIndex::TwoIndexLieAlgebraMatrix(h_sym,Ar_sym); // Re-extract h_sym - SU::LatticeAlgebraVector h_sym2(grid); + SU3::LatticeAlgebraVector 
h_sym2(grid); SU_TwoIndex< Nc, Symmetric>::projectOnAlgebra(h_sym2, Ar_sym); - SU::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2; + SU3::LatticeAlgebraVector h_diff_sym = h_sym - h_sym2; std::cout << GridLogMessage << "Projections structure check vector difference (Two Index Symmetric): " << norm2(h_diff_sym) << std::endl; @@ -379,11 +379,11 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af_sym(grid); - SU::FundamentalLieAlgebraMatrix(h_sym,Af_sym); - SU::LatticeMatrix Ufund2(grid); + SU3::LatticeMatrix Af_sym(grid); + SU3::FundamentalLieAlgebraMatrix(h_sym,Af_sym); + SU3::LatticeMatrix Ufund2(grid); Ufund2 = expMat(Af_sym, 1.0, 16); - SU::LatticeMatrix UnitCheck2(grid); + SU3::LatticeMatrix UnitCheck2(grid); UnitCheck2 = Ufund2 * adj(Ufund2) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2) << std::endl; @@ -421,14 +421,14 @@ int main(int argc, char** argv) { // Test group structure // (U_f * V_f)_r = U_r * V_r LatticeGaugeField U2A(grid), V2A(grid); - SU::HotConfiguration(gridRNG, U2A); - SU::HotConfiguration(gridRNG, V2A); + SU3::HotConfiguration(gridRNG, U2A); + SU3::HotConfiguration(gridRNG, V2A); LatticeGaugeField UV2A(grid); UV2A = Zero(); for (int mu = 0; mu < Nd; mu++) { - SU::LatticeMatrix Umu2A = peekLorentz(U2,mu); - SU::LatticeMatrix Vmu2A = peekLorentz(V2,mu); + SU3::LatticeMatrix Umu2A = peekLorentz(U2,mu); + SU3::LatticeMatrix Vmu2A = peekLorentz(V2,mu); pokeLorentz(UV2A,Umu2A*Vmu2A, mu); } @@ -455,16 +455,16 @@ int main(int argc, char** argv) { // Check correspondence of algebra and group transformations // Create a random vector - SU::LatticeAlgebraVector h_Asym(grid); + SU3::LatticeAlgebraVector h_Asym(grid); typename TwoIndexRep< Nc, AntiSymmetric>::LatticeMatrix Ar_Asym(grid); random(gridRNG,h_Asym); h_Asym = real(h_Asym); SU_TwoIndex< Nc, AntiSymmetric>::TwoIndexLieAlgebraMatrix(h_Asym,Ar_Asym); // Re-extract h_sym - SU::LatticeAlgebraVector 
h_Asym2(grid); + SU3::LatticeAlgebraVector h_Asym2(grid); SU_TwoIndex< Nc, AntiSymmetric>::projectOnAlgebra(h_Asym2, Ar_Asym); - SU::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2; + SU3::LatticeAlgebraVector h_diff_Asym = h_Asym - h_Asym2; std::cout << GridLogMessage << "Projections structure check vector difference (Two Index anti-Symmetric): " << norm2(h_diff_Asym) << std::endl; @@ -489,11 +489,11 @@ int main(int argc, char** argv) { // Construct the fundamental matrix in the group - SU::LatticeMatrix Af_Asym(grid); - SU::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym); - SU::LatticeMatrix Ufund2A(grid); + SU3::LatticeMatrix Af_Asym(grid); + SU3::FundamentalLieAlgebraMatrix(h_Asym,Af_Asym); + SU3::LatticeMatrix Ufund2A(grid); Ufund2A = expMat(Af_Asym, 1.0, 16); - SU::LatticeMatrix UnitCheck2A(grid); + SU3::LatticeMatrix UnitCheck2A(grid); UnitCheck2A = Ufund2A * adj(Ufund2A) - uno_f; std::cout << GridLogMessage << "unitarity check 1: " << norm2(UnitCheck2A) << std::endl; diff --git a/tests/core/Test_main.cc b/tests/core/Test_main.cc index af8b747b..6e316aa6 100644 --- a/tests/core/Test_main.cc +++ b/tests/core/Test_main.cc @@ -231,6 +231,20 @@ int main(int argc, char **argv) { scalar = localInnerProduct(cVec, cVec); scalar = localNorm2(cVec); + std::cout << "Testing maxLocalNorm2" < * Lattice SU. 
double t0, t1, flops; double bytes; int ncall = 5000; @@ -549,7 +563,8 @@ int main(int argc, char **argv) { std::vector shiftcoor = coor; shiftcoor[dir] = (shiftcoor[dir] + shift + latt_size[dir]) % - (latt_size[dir] / mpi_layout[dir]); + (latt_size[dir]); + // (latt_size[dir] / mpi_layout[dir]); std::vector rl(4); for (int dd = 0; dd < 4; dd++) { diff --git a/tests/core/Test_mobius_eofa_even_odd.cc b/tests/core/Test_mobius_eofa_even_odd.cc index 68091229..7339f156 100644 --- a/tests/core/Test_mobius_eofa_even_odd.cc +++ b/tests/core/Test_mobius_eofa_even_odd.cc @@ -73,7 +73,7 @@ int main (int argc, char ** argv) LatticeFermion ref (FGrid); ref = Zero(); LatticeFermion tmp (FGrid); tmp = Zero(); LatticeFermion err (FGrid); err = Zero(); - LatticeGaugeField Umu (UGrid); SU3::HotConfiguration(RNG4, Umu); + LatticeGaugeField Umu (UGrid); SU::HotConfiguration(RNG4, Umu); std::vector U(4,UGrid); // Only one non-zero (y) diff --git a/tests/core/Test_quenched_update.cc b/tests/core/Test_quenched_update.cc index ef428d1b..22675913 100644 --- a/tests/core/Test_quenched_update.cc +++ b/tests/core/Test_quenched_update.cc @@ -55,7 +55,7 @@ int main (int argc, char ** argv) GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); - // SU3 colour operatoions + // SU colour operatoions LatticeColourMatrix link(grid); LatticeColourMatrix staple(grid); @@ -87,10 +87,10 @@ int main (int argc, char ** argv) link = PeekIndex(Umu,mu); - for( int subgroup=0;subgroup::su2subgroups();subgroup++ ) { // update Even checkerboard - SU3::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); + SU::SubGroupHeatBath(sRNG,pRNG,beta,link,staple,subgroup,20,mask); } diff --git a/tests/core/Test_reunitarise.cc b/tests/core/Test_reunitarise.cc new file mode 100644 index 00000000..6644be1a --- /dev/null +++ b/tests/core/Test_reunitarise.cc @@ -0,0 +1,145 @@ + 
/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD UU(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector pseeds({1,2,3,4,5}); // once 
I caught a fish alive + std::vector sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU::HotConfiguration(pRNG,Umu); + + U = PeekIndex(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "<(U,Nc-1,i); + element = element * phase; + PokeIndex(U,element,Nc-1,i); + } + U=U*0.1; + UU=U; + + detU= Determinant(U) ; + detU=detU-1.0; + std::cout << "Determinant defect before projection " <::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/core/Test_staggered5D.cc b/tests/core/Test_staggered5D.cc index 3d175890..6ab15873 100644 --- a/tests/core/Test_staggered5D.cc +++ b/tests/core/Test_staggered5D.cc @@ -75,7 +75,7 @@ int main (int argc, char ** argv) FermionField phi (FGrid); random(pRNG5,phi); FermionField chi (FGrid); random(pRNG5,chi); - LatticeGaugeField Umu(UGrid); SU3::ColdConfiguration(pRNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::ColdConfiguration(pRNG4,Umu); LatticeGaugeField Umua(UGrid); Umua=Umu; double volume=Ls; diff --git a/tests/core/Test_staggered5Dvec.cc b/tests/core/Test_staggered5Dvec.cc index 73241276..ef8da662 100644 --- a/tests/core/Test_staggered5Dvec.cc +++ b/tests/core/Test_staggered5Dvec.cc @@ -84,7 +84,7 @@ int main (int argc, char ** argv) FermionField chi (FGrid); random(pRNG5,chi); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(pRNG4,Umu); + SU::HotConfiguration(pRNG4,Umu); /* for(int mu=1;mu<4;mu++){ diff --git a/tests/core/Test_staggered5DvecF.cc b/tests/core/Test_staggered5DvecF.cc index 2386d054..6893551c 100644 --- a/tests/core/Test_staggered5DvecF.cc +++ b/tests/core/Test_staggered5DvecF.cc @@ -83,7 +83,7 @@ int main (int argc, char ** argv) FermionField chi (FGrid); random(pRNG5,chi); LatticeGaugeFieldF Umu(UGrid); - SU3::HotConfiguration(pRNG4,Umu); + SU::HotConfiguration(pRNG4,Umu); /* for(int mu=1;mu<4;mu++){ diff --git 
a/tests/core/Test_staggered_naive.cc b/tests/core/Test_staggered_naive.cc index 9fe35a54..f41d723d 100644 --- a/tests/core/Test_staggered_naive.cc +++ b/tests/core/Test_staggered_naive.cc @@ -64,7 +64,7 @@ int main (int argc, char ** argv) FermionField err(&Grid); tmp=Zero(); FermionField phi (&Grid); random(pRNG,phi); FermionField chi (&Grid); random(pRNG,chi); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/core/Test_unary.cc b/tests/core/Test_unary.cc new file mode 100644 index 00000000..2ad6ba7b --- /dev/null +++ b/tests/core/Test_unary.cc @@ -0,0 +1,106 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_quenched_update.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt({8,8,8,8}); + GridCartesian * grid = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexD::Nsimd()), + GridDefaultMpi()); + + GridCartesian * gridF = SpaceTimeGrid::makeFourDimGrid(latt, + GridDefaultSimd(Nd,vComplexF::Nsimd()), + GridDefaultMpi()); + + + /////////////////////////////// + // Configuration of known size + /////////////////////////////// + LatticeColourMatrixD ident(grid); + LatticeColourMatrixD U(grid); + LatticeColourMatrixD tmp(grid); + LatticeColourMatrixD org(grid); + LatticeColourMatrixF UF(gridF); + + LatticeGaugeField Umu(grid); + + ident =1.0; + + // RNG set up for test + std::vector pseeds({1,2,3,4,5}); // once I caught a fish alive + std::vector sseeds({6,7,8,9,10});// then i let it go again + GridParallelRNG pRNG(grid); pRNG.SeedFixedIntegers(pseeds); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(sseeds); + + SU::HotConfiguration(pRNG,Umu); + + U = PeekIndex(Umu,0); + org=U; + + + tmp= U*adj(U) - ident ; + RealD Def1 = norm2( tmp ); + std::cout << " Defect1 "< latt_size ({N,4,4}); - std::vector simd_layout({vComplexD::Nsimd(),1,1}); - std::vector mpi_layout ({1,1,1}); + std::vector latt_size ({N,N,N,N}); + std::vector simd_layout({vComplexD::Nsimd(),1,1,1}); + std::vector mpi_layout ({1,1,1,1}); int vol = 1; int nd = latt_size.size(); @@ -69,7 +69,7 @@ int main (int argc, char ** argv) for(int t=0;t +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; + ; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt_size ({N,4,4}); + std::vector simd_layout({vComplexD::Nsimd(),1,1}); + std::vector mpi_layout ({1,1,1}); + + int vol = 1; + int nd = latt_size.size(); + for(int d=0;d({45,12,81,9})); + + std::cout<::HotConfiguration(pRNG, Umu); std::vector U(4, &Grid); double volume = 1; diff --git a/tests/core/Test_wilson_even_odd.cc b/tests/core/Test_wilson_even_odd.cc index dc49cf81..e7733a79 100644 --- a/tests/core/Test_wilson_even_odd.cc +++ b/tests/core/Test_wilson_even_odd.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) LatticeFermion tmp(&Grid); tmp=Zero(); LatticeFermion err(&Grid); tmp=Zero(); LatticeGaugeField Umu(&Grid); - SU3::HotConfiguration(pRNG,Umu); + SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); double volume=1; diff --git a/tests/core/Test_wilson_twisted_mass_even_odd.cc b/tests/core/Test_wilson_twisted_mass_even_odd.cc index ba80fd0e..e0f73456 100644 --- a/tests/core/Test_wilson_twisted_mass_even_odd.cc +++ b/tests/core/Test_wilson_twisted_mass_even_odd.cc @@ -71,7 +71,7 @@ int main (int argc, char ** argv) LatticeFermion ref(&Grid); ref=Zero(); LatticeFermion tmp(&Grid); tmp=Zero(); LatticeFermion err(&Grid); tmp=Zero(); - LatticeGaugeField 
Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); double volume=1; diff --git a/tests/debug/Test_cayley_cg.cc b/tests/debug/Test_cayley_cg.cc index 5a9c696f..5418a8af 100644 --- a/tests/debug/Test_cayley_cg.cc +++ b/tests/debug/Test_cayley_cg.cc @@ -116,7 +116,7 @@ int main (int argc, char ** argv) LatticeGaugeField Umu(UGrid); LatticeGaugeFieldF UmuF(UGridF); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); precisionChange(UmuF,Umu); std::vector U(4,UGrid); diff --git a/tests/debug/Test_cayley_coarsen_support.cc b/tests/debug/Test_cayley_coarsen_support.cc index e91b3070..b2f691d7 100644 --- a/tests/debug/Test_cayley_coarsen_support.cc +++ b/tests/debug/Test_cayley_coarsen_support.cc @@ -77,7 +77,7 @@ int main (int argc, char ** argv) LatticeFermion ref(FGrid); ref=Zero(); LatticeFermion tmp(FGrid); LatticeFermion err(FGrid); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); #if 0 std::vector U(4,UGrid); diff --git a/tests/debug/Test_cayley_even_odd.cc b/tests/debug/Test_cayley_even_odd.cc index 433f0722..5e800b26 100644 --- a/tests/debug/Test_cayley_even_odd.cc +++ b/tests/debug/Test_cayley_even_odd.cc @@ -70,7 +70,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); RealD mass=0.1; diff --git a/tests/debug/Test_cayley_ldop_cr.cc b/tests/debug/Test_cayley_ldop_cr.cc index 82f388ab..416017e5 100644 --- a/tests/debug/Test_cayley_ldop_cr.cc +++ b/tests/debug/Test_cayley_ldop_cr.cc @@ -71,9 +71,9 @@ int main (int argc, char ** argv) std::string file("./ckpoint_lat.400"); NerscIO::readConfiguration(Umu,header,file); - // 
SU3::ColdConfiguration(RNG4,Umu); - // SU3::TepidConfiguration(RNG4,Umu); - // SU3::HotConfiguration(RNG4,Umu); + // SU::ColdConfiguration(RNG4,Umu); + // SU::TepidConfiguration(RNG4,Umu); + // SU::HotConfiguration(RNG4,Umu); // Umu=Zero(); RealD mass=0.1; diff --git a/tests/debug/Test_cayley_mres.cc b/tests/debug/Test_cayley_mres.cc index 2ad605b8..bfbc3cf7 100644 --- a/tests/debug/Test_cayley_mres.cc +++ b/tests/debug/Test_cayley_mres.cc @@ -33,13 +33,14 @@ using namespace Grid; template -void TestConserved(What & Ddwf, What & Ddwfrev, +void TestConserved(What & Ddwf, LatticeGaugeField &Umu, GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid, GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid, RealD mass, RealD M5, GridParallelRNG *RNG4, - GridParallelRNG *RNG5); + GridParallelRNG *RNG5, + What *Ddwfrev=nullptr); Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, @@ -102,14 +103,25 @@ int main (int argc, char ** argv) GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF); - std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); - GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG RNG4(UGrid); + std::vector seeds4({1,2,3,4}); RNG4.SeedFixedIntegers(seeds4); + //const std::string seeds4{ "test-gauge-3000" }; RNG4.SeedUniqueString( seeds4 ); LatticeGaugeField Umu(UGrid); - SU3::ColdConfiguration(Umu); - // SU3::HotConfiguration(RNG4,Umu); + if( argc > 1 && argv[1][0] != '-' ) + { + std::cout<::ColdConfiguration(Umu); + SU::HotConfiguration(RNG4,Umu); + } RealD mass=0.3; RealD M5 =1.0; @@ -117,7 +129,7 @@ int main (int argc, char ** argv) std::cout<(Ddwf,Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestConserved(Ddwf,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); RealD b=1.5;// Scale factor b+c=2, b-c=1 RealD c=0.5; @@ -127,13 +139,13 @@ int main (int argc, char ** argv) 
std::cout<(Dmob,Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestConserved(Dmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(Dsham,Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestConserved(Dsham,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); std::cout<(ZDmob,ZDmobrev,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5); + TestConserved(ZDmob,Umu,FGrid,FrbGrid,UGrid,UrbGrid,mass,M5,&RNG4,&RNG5,&ZDmobrev); Grid_finalize(); } @@ -151,22 +162,17 @@ int main (int argc, char ** argv) template -void TestConserved(Action & Ddwf, - Action & Ddwfrev, +void TestConserved(Action & Ddwf, LatticeGaugeField &Umu, GridCartesian * FGrid, GridRedBlackCartesian * FrbGrid, GridCartesian * UGrid, GridRedBlackCartesian * UrbGrid, RealD mass, RealD M5, GridParallelRNG *RNG4, - GridParallelRNG *RNG5) + GridParallelRNG *RNG5, + Action * Ddwfrev) { - int Ls=Ddwf.Ls; - - LatticePropagator phys_src(UGrid); - - std::vector U(4,UGrid); - - LatticePropagator seqsrc(FGrid); + LatticePropagator phys_src(UGrid); + LatticePropagator seqsrc(FGrid); LatticePropagator prop5(FGrid); LatticePropagator prop5rev(FGrid); LatticePropagator prop4(UGrid); @@ -184,9 +190,9 @@ void TestConserved(Action & Ddwf, phys_src=Zero(); pokeSite(kronecker,phys_src,coor); - MdagMLinearOperator HermOp(Ddwf); - MdagMLinearOperator HermOprev(Ddwfrev); ConjugateGradient CG(1.0e-16,100000); + SchurRedBlackDiagTwoSolve schur(CG); + ZeroGuesser zpg; for(int s=0;s(prop5,result5,s,c); LatticeFermion result4(UGrid); Ddwf.ExportPhysicalFermionSolution(result5,result4); FermToProp(prop4,result4,s,c); - Ddwfrev.ImportPhysicalFermionSource(src4,src5); - Ddwfrev.Mdag(src5,Mdagsrc5); - CG(HermOprev,Mdagsrc5,result5); + if( Ddwfrev ) { + Ddwfrev->ImportPhysicalFermionSource(src4,src5); + result5 = Zero(); + schur(*Ddwfrev,src5,result5,zpg); + } FermToProp(prop5rev,result5,s,c); } } @@ -241,11 +247,7 @@ void TestConserved(Action & Ddwf, PropToFerm(src5,seqsrc,s,c); LatticeFermion 
result5(FGrid); result5=Zero(); - - // CGNE - LatticeFermion Mdagsrc5 (FGrid); - Ddwf.Mdag(src5,Mdagsrc5); - CG(HermOp,Mdagsrc5,result5); + schur(Ddwf,src5,result5,zpg); LatticeFermion result4(UGrid); Ddwf.ExportPhysicalFermionSolution(result5,result4); @@ -266,10 +268,10 @@ void TestConserved(Action & Ddwf, Ddwf.ContractConservedCurrent(prop5rev,prop5,Vector_mu,phys_src,Current::Vector,Tdir); Ddwf.ContractJ5q(prop5,PJ5q); - PA = trace(g5*Axial_mu); - SV = trace(Vector_mu); - VV = trace(gT*Vector_mu); - PP = trace(adj(prop4)*prop4); + PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current + SV = trace(Vector_mu); // Scalar-Vector conserved current + VV = trace(gT*Vector_mu); // (local) Vector-Vector conserved current + PP = trace(adj(prop4)*prop4); // Pseudoscalar density // Spatial sum sliceSum(PA,sumPA,Tdir); @@ -278,15 +280,17 @@ void TestConserved(Action & Ddwf, sliceSum(PP,sumPP,Tdir); sliceSum(PJ5q,sumPJ5q,Tdir); - int Nt=sumPA.size(); + const int Nt{static_cast(sumPA.size())}; + std::cout< seeds4({1, 2, 3, 4}); std::vector seeds5({5, 6, 7, 8}); + GridSerialRNG sRNG; GridParallelRNG RNG5(FGrid); + sRNG.SeedFixedIntegers(seeds5); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); DomainWallEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5); DomainWallEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5); @@ -84,7 +86,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, false); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu,sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } @@ -94,7 +96,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, true); - 
Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu,sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } diff --git a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc index cc118d1d..7eabfc65 100644 --- a/tests/debug/Test_heatbath_dwf_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_dwf_eofa_gparity.cc @@ -74,10 +74,13 @@ int main(int argc, char** argv) RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridSerialRNG sRNG; + RNG4.SeedFixedIntegers(seeds4); + sRNG.SeedFixedIntegers(seeds5); // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // GparityDomainWallFermionR::ImplParams params; FermionAction::ImplParams params; @@ -90,7 +93,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, false); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu,sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } @@ -100,7 +103,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, true); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu,sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } diff --git a/tests/debug/Test_heatbath_mobius_eofa.cc b/tests/debug/Test_heatbath_mobius_eofa.cc index 95ab935e..48806642 100644 --- a/tests/debug/Test_heatbath_mobius_eofa.cc +++ b/tests/debug/Test_heatbath_mobius_eofa.cc @@ -68,14 +68,16 @@ int main(int argc, char** argv) // Set up RNGs std::vector seeds4({1, 2, 3, 4}); std::vector seeds5({5, 6, 7, 8}); + GridSerialRNG sRNG; GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + sRNG.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); MobiusEOFAFermionR Lop(Umu, 
*FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c); MobiusEOFAFermionR Rop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mpv, mf, mpv, -1.0, 1, M5, b, c); @@ -86,7 +88,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, false); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu, sRNG,RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } @@ -96,7 +98,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, true); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu, sRNG,RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } diff --git a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc index 7ed3a308..52447e5e 100644 --- a/tests/debug/Test_heatbath_mobius_eofa_gparity.cc +++ b/tests/debug/Test_heatbath_mobius_eofa_gparity.cc @@ -73,13 +73,15 @@ int main(int argc, char** argv) std::vector seeds4({1, 2, 3, 4}); std::vector seeds5({5, 6, 7, 8}); GridParallelRNG RNG5(FGrid); + GridSerialRNG sRNG; RNG5.SeedFixedIntegers(seeds5); + sRNG.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); FermionAction::ImplParams params; FermionAction Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, mf, mpv, 0.0, -1, M5, b, c, params); @@ -91,7 +93,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, false); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu, sRNG, RNG5); printf(" = %1.15e\n", Meofa.S(Umu)); } @@ -101,7 +103,7 @@ int main(int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, Params, true); - Meofa.refresh(Umu, RNG5); + Meofa.refresh(Umu, sRNG, RNG5); 
printf(" = %1.15e\n", Meofa.S(Umu)); } diff --git a/tests/debug/Test_reweight_dwf_eofa.cc b/tests/debug/Test_reweight_dwf_eofa.cc index 728fbf78..a150b18f 100644 --- a/tests/debug/Test_reweight_dwf_eofa.cc +++ b/tests/debug/Test_reweight_dwf_eofa.cc @@ -102,7 +102,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators DomainWallFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5); diff --git a/tests/debug/Test_reweight_dwf_eofa_gparity.cc b/tests/debug/Test_reweight_dwf_eofa_gparity.cc index fcc01b8d..df2d95a0 100644 --- a/tests/debug/Test_reweight_dwf_eofa_gparity.cc +++ b/tests/debug/Test_reweight_dwf_eofa_gparity.cc @@ -104,7 +104,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators GparityDomainWallFermionR::ImplParams params; diff --git a/tests/debug/Test_reweight_mobius_eofa.cc b/tests/debug/Test_reweight_mobius_eofa.cc index c5e46bcf..88ecab7d 100644 --- a/tests/debug/Test_reweight_mobius_eofa.cc +++ b/tests/debug/Test_reweight_mobius_eofa.cc @@ -104,7 +104,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // Initialize RHMC fermion operators MobiusFermionR Ddwf_f(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mf, M5, b, c); diff --git a/tests/debug/Test_reweight_mobius_eofa_gparity.cc b/tests/debug/Test_reweight_mobius_eofa_gparity.cc index bfc7543a..31708265 100644 --- a/tests/debug/Test_reweight_mobius_eofa_gparity.cc +++ b/tests/debug/Test_reweight_mobius_eofa_gparity.cc @@ -106,7 +106,7 @@ int main(int argc, char **argv) // Random gauge field LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); // Initialize 
RHMC fermion operators GparityDomainWallFermionR::ImplParams params; diff --git a/tests/forces/Test_contfrac_force.cc b/tests/forces/Test_contfrac_force.cc index cb30faad..dc9eedce 100644 --- a/tests/forces/Test_contfrac_force.cc +++ b/tests/forces/Test_contfrac_force.cc @@ -59,7 +59,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -93,7 +93,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_dwf_force.cc b/tests/forces/Test_dwf_force.cc index 81a1b8c4..e7d17347 100644 --- a/tests/forces/Test_dwf_force.cc +++ b/tests/forces/Test_dwf_force.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -94,7 +94,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_dwf_force_eofa.cc b/tests/forces/Test_dwf_force_eofa.cc index 0b0ba346..525178d0 100644 --- a/tests/forces/Test_dwf_force_eofa.cc +++ b/tests/forces/Test_dwf_force_eofa.cc @@ -72,7 +72,7 @@ int main (int argc, char** argv) LatticeFermion MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -86,7 +86,9 @@ int main (int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true); - Meofa.refresh(U, RNG5); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + 
Meofa.refresh(U, sRNG, RNG5 ); + RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" @@ -105,7 +107,7 @@ int main (int argc, char** argv) for(int mu=0; mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); diff --git a/tests/forces/Test_dwf_gpforce.cc b/tests/forces/Test_dwf_gpforce.cc index b39fdd14..1fa1c6e4 100644 --- a/tests/forces/Test_dwf_gpforce.cc +++ b/tests/forces/Test_dwf_gpforce.cc @@ -63,8 +63,8 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); - // SU3::ColdConfiguration(pRNG,U); + SU::HotConfiguration(RNG4,U); + // SU::ColdConfiguration(pRNG,U); //////////////////////////////////// // Unmodified matrix element @@ -84,6 +84,13 @@ int main (int argc, char ** argv) GparityDomainWallFermionR::ImplParams params; params.twists = twists; + /* + params.boundary_phases[0] = 1.0; + params.boundary_phases[1] = 1.0; + params.boundary_phases[2] = 1.0; + params.boundary_phases[3] =- 1.0; + */ + GparityDomainWallFermionR Dw(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); Dw.M (phi,Mphi); @@ -96,6 +103,16 @@ int main (int argc, char ** argv) Dw.MDeriv(tmp , Mphi, phi,DaggerNo ); UdSdU=tmp; Dw.MDeriv(tmp , phi, Mphi,DaggerYes ); UdSdU=(UdSdU+tmp); + + // ***************************************************************************************** + // *** There is a funny negative sign in all derivatives. This is - UdSdU. *** + // *** *** + // *** Deriv in both Wilson gauge action and the TwoFlavour.h seems to miss a minus sign *** + // *** UdSdU is negated relative to what I think - call what is returned mUdSdU, *** + // *** and insert minus sign *** + // ***************************************************************************************** + + UdSdU = - UdSdU ; // Follow sign convention of actions in Grid. Seems crazy. 
FermionField Ftmp (FGrid); @@ -106,18 +123,28 @@ int main (int argc, char ** argv) RealD Hmom = 0.0; RealD Hmomprime = 0.0; LatticeColourMatrix mommu(UGrid); - LatticeColourMatrix forcemu(UGrid); + LatticeColourMatrix mUdSdUmu(UGrid); LatticeGaugeField mom(UGrid); LatticeGaugeField Uprime(UGrid); for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg - Hmom -= real(sum(trace(mommu*mommu))); + // Momentum Hamiltonian is - trace(p^2)/HMC_MOM_DENOMINATOR + // + // Integrator.h: RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom // GaugeImplTypes.h: Hloc += trace(Pmu * Pmu); + // Sign comes from a sneaky multiply by "i" in GaussianFundemantalLie algebra + // P is i P^a_\mu T^a, not Pa Ta + // + // Integrator.h: H = Hmom + sum S(action) + Hmom -= real(sum(trace(mommu*mommu)))/ HMC_MOMENTUM_DENOMINATOR; PokeIndex(mom,mommu,mu); + // -- Drops factor of "i" in the U update: U' = e^{P dt} U [ _not_ e^{iPdt}U ]. P is anti hermitian already + // -- Udot = p U + // fourth order exponential approx autoView( mom_v, mom, CpuRead); autoView( U_v , U, CpuRead); @@ -134,8 +161,8 @@ int main (int argc, char ** argv) ; }); } - std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <(mom,mu); - std::cout << GridLogMessage<< " Mommu " << norm2(mommu)<(UdSdU,mu); - std::cout << GridLogMessage<< " dsdumu " << norm2(mommu)<(UdSdU,mu); - mommu=Ta(mommu)*2.0; + mommu=Ta(mommu); // projectForce , GaugeImplTypes.h PokeIndex(UdSdU,mommu,mu); } for(int mu=0;mu(mom,mu); - std::cout << GridLogMessage<< " Mommu " << norm2(mommu)<(UdSdU,mu); - std::cout << GridLogMessage<< " dsdumu " << norm2(mommu)<(UdSdU,mu); + mUdSdUmu= PeekIndex(UdSdU,mu); mommu = PeekIndex(mom,mu); - // Update PF action density - dS = dS+trace(mommu*forcemu)*dt; + // + // Derive HMC eom: + // + // Sdot = - 2 trace( p p^dot ) / D - trace( p [ mUdSdU - h.c. 
] ) = 0 + // + // + // Sdot = 0 = - 2 trace( p p^dot ) / D - 2 trace( p Ta( mUdSdU ) = 0 + // + // EOM: + // + // pdot = - D Ta( mUdSdU ) -- source of sign is the "funny sign" above + // + // dSqcd_dt = - 2.0*trace(mommu* Ta(mUdSdU) )*dt -- i.e. mUdSdU with adjoint term -> force has a 2x implicit + // + // dH_mom/dt = - 2 trace (p pdot)/Denom + // + // dH_tot / dt = 0 <= pdot = - Denom * mUdSdU + // + // dH_mom/dt = 2 trace (p mUdSdU ) + // + // True Momentum delta H has a dt^2 piece + // + // dSmom = [ trace mom*mom - trace ( (mom-Denom*f*dt)(mom-Denom*f*dt) ) ] / Denom + // = 2*trace(mom*f) dt - Denom*dt*dt * trace(f*f). + // = dSmom + dSmom2 + // - dSmom = dSmom - trace(mommu*forcemu) * dt; - dSmom2 = dSmom2 - trace(forcemu*forcemu) *(0.25* dt*dt); + dS = dS - 2.0*trace(mommu*mUdSdUmu)*dt; // U and Udagger derivs hence 2x. - // Update mom action density - mommu = mommu + forcemu*(dt*0.5); + dSmom = dSmom + 2.0*trace(mommu*mUdSdUmu) * dt; // this 2.0 coms from derivative of p^2 + + dSmom2 = dSmom2 - trace(mUdSdUmu*mUdSdUmu) * dt*dt* HMC_MOMENTUM_DENOMINATOR; // Remnant - Hmomprime -= real(sum(trace(mommu*mommu))); + // Update mom action density . 
Verbatim update_P in Integrator.h + mommu = mommu - mUdSdUmu * dt* HMC_MOMENTUM_DENOMINATOR;; + + Hmomprime -= real(sum(trace(mommu*mommu))) / HMC_MOMENTUM_DENOMINATOR; } @@ -199,20 +233,25 @@ int main (int argc, char ** argv) ComplexD dSm = sum(dSmom); ComplexD dSm2 = sum(dSmom2); + std::cout << GridLogMessage <<"dSm "<< dSm<::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -90,7 +90,8 @@ int main (int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, true); - Meofa.refresh(U, RNG5); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + Meofa.refresh(U, sRNG, RNG5); RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" @@ -109,7 +110,7 @@ int main (int argc, char** argv) for(int mu=0; mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); diff --git a/tests/forces/Test_gp_plaq_force.cc b/tests/forces/Test_gp_plaq_force.cc index 21f0b9d0..bc2b5b26 100644 --- a/tests/forces/Test_gp_plaq_force.cc +++ b/tests/forces/Test_gp_plaq_force.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG,U); + SU::HotConfiguration(pRNG,U); double beta = 1.0; ConjugateWilsonGaugeActionR Action(beta); @@ -80,7 +80,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_gp_rect_force.cc b/tests/forces/Test_gp_rect_force.cc index bb4ea6de..e277ea6b 100644 --- a/tests/forces/Test_gp_rect_force.cc +++ b/tests/forces/Test_gp_rect_force.cc @@ -29,7 +29,6 @@ Author: paboyle using namespace std; using namespace Grid; - ; @@ -54,11 +53,15 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - 
SU3::HotConfiguration(pRNG,U); + SU::HotConfiguration(pRNG,U); double beta = 1.0; double c1 = 0.331; + const int nu = 1; + std::vector twists(Nd,0); + twists[nu] = 1; + ConjugateGimplD::setDirections(twists); ConjugatePlaqPlusRectangleActionR Action(beta,c1); //ConjugateWilsonGaugeActionR Action(beta); //WilsonGaugeActionR Action(beta); @@ -82,7 +85,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_gpdwf_force.cc b/tests/forces/Test_gpdwf_force.cc index bdc332d9..d6744080 100644 --- a/tests/forces/Test_gpdwf_force.cc +++ b/tests/forces/Test_gpdwf_force.cc @@ -63,7 +63,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -100,7 +100,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_gpwilson_force.cc b/tests/forces/Test_gpwilson_force.cc index 1c85a5d9..d731f27a 100644 --- a/tests/forces/Test_gpwilson_force.cc +++ b/tests/forces/Test_gpwilson_force.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -94,7 +94,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_laplacian_force.cc b/tests/forces/Test_laplacian_force.cc index 639378dc..dbaf1cbd 100644 --- a/tests/forces/Test_laplacian_force.cc +++ b/tests/forces/Test_laplacian_force.cc @@ -46,6 +46,7 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); + 
GridSerialRNG sRNG; sRNG.SeedFixedIntegers({4,5,6,7}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector({15,91,21,3})); @@ -58,7 +59,7 @@ int main (int argc, char ** argv) PokeIndex(P, P_mu, mu); } - SU3::HotConfiguration(pRNG,U); + SU::HotConfiguration(pRNG,U); ConjugateGradient CG(1.0e-8, 10000); @@ -67,7 +68,7 @@ int main (int argc, char ** argv) LaplacianAdjointField Laplacian(&Grid, CG, LapPar, Kappa); GeneralisedMomenta LaplacianMomenta(&Grid, Laplacian); LaplacianMomenta.M.ImportGauge(U); - LaplacianMomenta.MomentaDistribution(pRNG);// fills the Momenta with the correct distr + LaplacianMomenta.MomentaDistribution(sRNG,pRNG);// fills the Momenta with the correct distr std::cout << std::setprecision(15); @@ -95,7 +96,7 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Update the U " << std::endl; for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); auto Umu = PeekIndex(U, mu); PokeIndex(mom,mommu,mu); Umu = expMat(mommu, dt, 12) * Umu; diff --git a/tests/forces/Test_mobius_force.cc b/tests/forces/Test_mobius_force.cc index 11e69652..d2326a81 100644 --- a/tests/forces/Test_mobius_force.cc +++ b/tests/forces/Test_mobius_force.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -69,7 +69,14 @@ int main (int argc, char ** argv) RealD M5=1.8; RealD b=0.5; RealD c=0.5; - MobiusFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c); + + WilsonImplParams p; + p.boundary_phases[0] = 1.0; + p.boundary_phases[1] = 1.0; + p.boundary_phases[2] = 1.0; + p.boundary_phases[3] =- 1.0; + + MobiusFermionR Ddwf(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,p); Ddwf.M (phi,Mphi); ComplexD S = innerProduct(Mphi,Mphi); // pdag MdagM p @@ -82,24 +89,44 @@ int main (int argc, char ** argv) Ddwf.MDeriv(tmp , Mphi, phi,DaggerNo ); UdSdU=tmp; Ddwf.MDeriv(tmp , phi, 
Mphi,DaggerYes ); UdSdU=(UdSdU+tmp); + // ***************************************************************************************** + // *** There is a funny negative sign in all derivatives. This is - UdSdU. *** + // *** *** + // *** Deriv in both Wilson gauge action and the TwoFlavour.h seems to miss a minus sign *** + // *** UdSdU is negated relative to what I think - call what is returned mUdSdU, *** + // *** and insert minus sign *** + // ***************************************************************************************** + + UdSdU = - UdSdU ; // Follow sign convention of actions in Grid. Seems crazy. + LatticeFermion Ftmp (FGrid); //////////////////////////////////// // Modify the gauge field a little //////////////////////////////////// - RealD dt = 0.0001; + RealD dt = 0.001; + RealD Hmom = 0.0; + RealD Hmomprime = 0.0; LatticeColourMatrix mommu(UGrid); - LatticeColourMatrix forcemu(UGrid); + LatticeColourMatrix mUdSdUmu(UGrid); LatticeGaugeField mom(UGrid); LatticeGaugeField Uprime(UGrid); for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); + // Momentum Hamiltonian is - trace(p^2)/HMC_MOM_DENOMINATOR + // + // Integrator.h: RealD H = - FieldImplementation::FieldSquareNorm(P)/HMC_MOMENTUM_DENOMINATOR; // - trace (P*P)/denom // GaugeImplTypes.h: Hloc += trace(Pmu * Pmu); + // Sign comes from a sneaky multiply by "i" in GaussianFundemantalLie algebra + // P is i P^a_\mu T^a, not Pa Ta + // + // Integrator.h: H = Hmom + sum S(action) + Hmom -= real(sum(trace(mommu*mommu)))/ HMC_MOMENTUM_DENOMINATOR; + // fourth order exponential approx autoView( U_v , U, CpuRead); autoView( mom_v, mom, CpuRead); @@ -115,6 +142,7 @@ int main (int argc, char ** argv) ; }); } + std::cout << GridLogMessage <<"Initial mom hamiltonian is "<< Hmom <(UdSdU,mu); - mommu=Ta(mommu)*2.0; + mommu=Ta(mommu); PokeIndex(UdSdU,mommu,mu); } for(int mu=0;mu(UdSdU,mu); + + mUdSdUmu= 
PeekIndex(UdSdU,mu); mommu = PeekIndex(mom,mu); - // Update PF action density - dS = dS+trace(mommu*forcemu)*dt; + // + // Derive HMC eom: + // + // Sdot = - 2 trace( p p^dot ) / D - trace( p [ mUdSdU - h.c. ] ) = 0 + // + // + // Sdot = 0 = - 2 trace( p p^dot ) / D - 2 trace( p Ta( mUdSdU ) = 0 + // + // EOM: + // + // pdot = - D Ta( mUdSdU ) -- source of sign is the "funny sign" above + // + // dSqcd_dt = - 2.0*trace(mommu* Ta(mUdSdU) )*dt -- i.e. mUdSdU with adjoint term -> force has a 2x implicit + // + // dH_mom/dt = - 2 trace (p pdot)/Denom + // + // dH_tot / dt = 0 <= pdot = - Denom * mUdSdU + // + // dH_mom/dt = 2 trace (p mUdSdU ) + // + // True Momentum delta H has a dt^2 piece + // + // dSmom = [ trace mom*mom - trace ( (mom-Denom*f*dt)(mom-Denom*f*dt) ) ] / Denom + // = 2*trace(mom*f) dt - Denom*dt*dt * trace(f*f). + // = dSmom + dSmom2 + // + + dS = dS - 2.0*trace(mommu*mUdSdUmu)*dt; // U and Udagger derivs hence 2x. + + dSmom = dSmom + 2.0*trace(mommu*mUdSdUmu) * dt; // this 2.0 coms from derivative of p^2 + + dSmom2 = dSmom2 - trace(mUdSdUmu*mUdSdUmu) * dt*dt* HMC_MOMENTUM_DENOMINATOR; // Remnant + + mommu = mommu - mUdSdUmu * dt* HMC_MOMENTUM_DENOMINATOR;; + + Hmomprime -= real(sum(trace(mommu*mommu))) / HMC_MOMENTUM_DENOMINATOR; } ComplexD dSpred = sum(dS); + ComplexD dSm = sum(dSmom); + ComplexD dSm2 = sum(dSmom2); - std::cout << GridLogMessage << " -- S "<::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -88,7 +88,8 @@ int main (int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false); - Meofa.refresh(U, RNG5); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + Meofa.refresh(U, sRNG, RNG5 ); RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" @@ -107,7 +108,7 @@ int main (int argc, char** argv) for(int mu=0; mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // 
Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); diff --git a/tests/forces/Test_mobius_gpforce_eofa.cc b/tests/forces/Test_mobius_gpforce_eofa.cc index 68163e63..7f114615 100644 --- a/tests/forces/Test_mobius_gpforce_eofa.cc +++ b/tests/forces/Test_mobius_gpforce_eofa.cc @@ -76,7 +76,7 @@ int main (int argc, char** argv) FermionField MphiPrime (FGrid); LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -93,7 +93,8 @@ int main (int argc, char** argv) ConjugateGradient CG(1.0e-12, 5000); ExactOneFlavourRatioPseudoFermionAction Meofa(Lop, Rop, CG, CG, CG, CG, CG, Params, false); - Meofa.refresh(U, RNG5); + GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4); + Meofa.refresh(U, sRNG, RNG5 ); RealD S = Meofa.S(U); // pdag M p // get the deriv of phidag M phi with respect to "U" @@ -112,7 +113,7 @@ int main (int argc, char** argv) for(int mu=0; mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom, mommu, mu); autoView( U_v , U, CpuRead); diff --git a/tests/forces/Test_momentum_filter.cc b/tests/forces/Test_momentum_filter.cc new file mode 100644 index 00000000..794b5fa0 --- /dev/null +++ b/tests/forces/Test_momentum_filter.cc @@ -0,0 +1,156 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_wilson_force.cc + + Copyright (C) 2015 + +Author: Christopher Kelly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +//Get the mu-directected links on the upper boundary and the bulk remainder +template +void getLinksBoundaryBulk(Field &bound, Field &bulk, Field &from, const Coordinate &latt_size){ + bound = Zero(); bulk = Zero(); + for(int mu=0;mu seeds({1,2,3,4}); + + GridParallelRNG pRNG(&Grid); + GridSerialRNG sRNG; + pRNG.SeedFixedIntegers(seeds); + sRNG.SeedFixedIntegers(seeds); + + typedef PeriodicGimplR Gimpl; + typedef WilsonGaugeAction GaugeAction; + typedef NoHirep Representation; //fundamental + typedef NoSmearing Smearing; + typedef MinimumNorm2 Omelyan; + typedef Gimpl::Field Field; + typedef MomentumFilterApplyPhase Filter; + Filter filter(&Grid); + + //Setup a filter that disables link update on links passing through the global lattice boundary + typedef Filter::LatticeLorentzScalarType MaskType; + typedef Filter::LorentzScalarType MaskSiteType; + + MaskSiteType zero, one; + for(int mu=0;mu::HotConfiguration(pRNG,U); + + //Get the original links on the bulk and boundary for later use + Field Ubnd_orig(&Grid), Ubulk_orig(&Grid); + getLinksBoundaryBulk(Ubnd_orig, Ubulk_orig, U, latt_size); + + ActionSet actions(1); + double beta=6; + GaugeAction gauge_action(beta); + actions[0].push_back(&gauge_action); + + Smearing smear; + IntegratorParameters params(1,1.); //1 MD step 
+ Omelyan integrator(&Grid, params, actions, smear); + + integrator.setMomentumFilter(filter); + + integrator.refresh(U, sRNG, pRNG); //doesn't actually change the gauge field + + //Check the momentum is zero on the boundary + const auto &P = integrator.getMomentum(); + Field Pbnd(&Grid), Pbulk(&Grid); + getLinksBoundaryBulk(Pbnd, Pbulk, const_cast(P), latt_size); + + RealD Pbnd_nrm = norm2(Pbnd); //expect zero + std::cout << GridLogMessage << "After refresh, norm2 of mu-directed conjugate momentum on boundary is: " << Pbnd_nrm << " (expect 0)" << std::endl; + RealD Pbulk_nrm = norm2(Pbulk); //expect non-zero + std::cout << GridLogMessage << "After refresh, norm2 of bulk conjugate momentum is: " << Pbulk_nrm << " (expect non-zero)" << std::endl; + + //Evolve the gauge field + integrator.integrate(U); + + //Check momentum is still zero on boundary + getLinksBoundaryBulk(Pbnd, Pbulk, const_cast(P), latt_size); + + Pbnd_nrm = norm2(Pbnd); //expect zero + std::cout << GridLogMessage << "After integrate, norm2 of mu-directed conjugate momentum on boundary is: " << Pbnd_nrm << " (expect 0)" << std::endl; + Pbulk_nrm = norm2(Pbulk); //expect non-zero + std::cout << GridLogMessage << "After integrate, norm2 of bulk conjugate momentum is: " << Pbulk_nrm << " (expect non-zero)" << std::endl; + + //Get the new bulk and bound links + Field Ubnd_new(&Grid), Ubulk_new(&Grid); + getLinksBoundaryBulk(Ubnd_new, Ubulk_new, U, latt_size); + + Field Ubnd_diff = Ubnd_new - Ubnd_orig; + Field Ubulk_diff = Ubulk_new - Ubulk_orig; + + RealD Ubnd_change = norm2( Ubnd_diff ); + RealD Ubulk_change = norm2( Ubulk_diff ); + std::cout << GridLogMessage << "After integrate, norm2 of change in mu-directed boundary links is : " << Ubnd_change << " (expect 0)" << std::endl; + std::cout << GridLogMessage << "After integrate, norm2 of change in bulk links is : " << Ubulk_change << " (expect non-zero)" << std::endl; + + Grid_finalize(); +} diff --git a/tests/forces/Test_partfrac_force.cc 
b/tests/forces/Test_partfrac_force.cc index 17dce530..33f7b5fd 100644 --- a/tests/forces/Test_partfrac_force.cc +++ b/tests/forces/Test_partfrac_force.cc @@ -62,7 +62,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -96,7 +96,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_rect_force.cc b/tests/forces/Test_rect_force.cc index ed72f2c0..c9326f8d 100644 --- a/tests/forces/Test_rect_force.cc +++ b/tests/forces/Test_rect_force.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG,U); + SU::HotConfiguration(pRNG,U); double beta = 1.0; double c1 = -0.331; @@ -82,7 +82,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/forces/Test_wilson_force.cc b/tests/forces/Test_wilson_force.cc index c8b3a7f4..b7bf1268 100644 --- a/tests/forces/Test_wilson_force.cc +++ b/tests/forces/Test_wilson_force.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(&Grid); //SU2::HotConfiguration(pRNG,U); - SU3::ColdConfiguration(pRNG,U); + SU::ColdConfiguration(pRNG,U); //////////////////////////////////// // Unmodified matrix element @@ -98,7 +98,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); Hmom -= real(sum(trace(mommu*mommu))); diff --git a/tests/forces/Test_wilsonclover_force.cc b/tests/forces/Test_wilsonclover_force.cc index f26f0ac9..6a28e4e2 100644 --- a/tests/forces/Test_wilsonclover_force.cc +++ b/tests/forces/Test_wilsonclover_force.cc @@ -62,8 +62,8 @@ int main(int argc, char 
**argv) LatticeGaugeField U(&Grid); - SU3::HotConfiguration(pRNG, U); - //SU3::ColdConfiguration(pRNG, U);// Clover term Zero() + SU::HotConfiguration(pRNG, U); + //SU::ColdConfiguration(pRNG, U);// Clover term Zero() //////////////////////////////////// // Unmodified matrix element @@ -101,7 +101,7 @@ int main(int argc, char **argv) for (int mu = 0; mu < Nd; mu++) { // Traceless antihermitian momentum; gaussian in lie alg - SU3::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); + SU::GaussianFundamentalLieAlgebraMatrix(pRNG, mommu); Hmom -= real(sum(trace(mommu * mommu))); PokeIndex(mom, mommu, mu); diff --git a/tests/forces/Test_zmobius_force.cc b/tests/forces/Test_zmobius_force.cc index e24ae601..89673bc7 100644 --- a/tests/forces/Test_zmobius_force.cc +++ b/tests/forces/Test_zmobius_force.cc @@ -59,7 +59,7 @@ int main (int argc, char ** argv) LatticeGaugeField U(UGrid); - SU3::HotConfiguration(RNG4,U); + SU::HotConfiguration(RNG4,U); //////////////////////////////////// // Unmodified matrix element @@ -109,7 +109,7 @@ int main (int argc, char ** argv) for(int mu=0;mu::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg PokeIndex(mom,mommu,mu); diff --git a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc index 3434fccc..9ca0b0a0 100644 --- a/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc +++ b/tests/hmc/Test_hmc_EODWFRatio_Gparity.cc @@ -81,6 +81,10 @@ int main(int argc, char **argv) { // that have a complex construction // standard RealD beta = 5.6 ; + const int nu = 3; + std::vector twists(Nd,0); + twists[nu] = 1; + ConjugateGimplD::setDirections(twists); ConjugateWilsonGaugeActionR Waction(beta); const int Ls = 8; @@ -93,9 +97,6 @@ int main(int argc, char **argv) { // temporarily need a gauge field LatticeGaugeField U(GridPtr); - const int nu = 3; - std::vector twists(Nd,0); - twists[nu] = 1; FermionAction::ImplParams params; params.twists = twists; Real mass=0.04; diff --git 
a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc index bc47b6c2..7f74d5d8 100644 --- a/tests/hmc/Test_hmc_GparityIwasakiGauge.cc +++ b/tests/hmc/Test_hmc_GparityIwasakiGauge.cc @@ -79,6 +79,10 @@ int main(int argc, char **argv) { // that have a complex construction // standard RealD beta = 2.6 ; + const int nu = 3; + std::vector twists(Nd,0); + twists[nu] = 1; + ConjugateGimplD::setDirections(twists); ConjugateIwasakiGaugeActionR Waction(beta); diff --git a/tests/hmc/Test_hmc_GparityWilsonGauge.cc b/tests/hmc/Test_hmc_GparityWilsonGauge.cc index eb057181..b8c078fe 100644 --- a/tests/hmc/Test_hmc_GparityWilsonGauge.cc +++ b/tests/hmc/Test_hmc_GparityWilsonGauge.cc @@ -80,6 +80,9 @@ int main(int argc, char **argv) { // that have a complex construction // standard RealD beta = 5.6 ; + std::vector twists(Nd,0); + twists[3] = 1; + ConjugateGimplD::setDirections(twists); ConjugateWilsonGaugeActionR Waction(beta); diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc index d9249e0d..3766e069 100644 --- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc +++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg_synthetic.cc @@ -293,7 +293,7 @@ int main (int argc, char ** argv) { { std::vector seeds4({1,2,3,4}); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); } std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; diff --git a/tests/lanczos/Test_dwf_lanczos.cc b/tests/lanczos/Test_dwf_lanczos.cc index 12283921..00d29ec0 100644 --- a/tests/lanczos/Test_dwf_lanczos.cc +++ b/tests/lanczos/Test_dwf_lanczos.cc @@ -54,7 +54,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, 
Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4, Umu); /* std::vector U(4, UGrid); diff --git a/tests/qdpxx/Test_qdpxx_baryon.cc b/tests/qdpxx/Test_qdpxx_baryon.cc index a1d8f738..d8225f82 100644 --- a/tests/qdpxx/Test_qdpxx_baryon.cc +++ b/tests/qdpxx/Test_qdpxx_baryon.cc @@ -280,7 +280,7 @@ void make_gauge(GaugeField &Umu, Grid::LatticePropagator &q1,Grid::LatticePropag Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu.Grid(); Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4, Umu); + Grid::SU::HotConfiguration(RNG4, Umu); // Propagator Grid::gaussian(RNG4, q1); diff --git a/tests/qdpxx/Test_qdpxx_loops_staples.cc b/tests/qdpxx/Test_qdpxx_loops_staples.cc index bbb41f4e..33057eeb 100644 --- a/tests/qdpxx/Test_qdpxx_loops_staples.cc +++ b/tests/qdpxx/Test_qdpxx_loops_staples.cc @@ -277,7 +277,7 @@ double calc_grid_p(Grid::LatticeGaugeField & Umu) Grid::GridCartesian * UGrid = (Grid::GridCartesian *) Umu.Grid(); Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4,Umu); + Grid::SU::HotConfiguration(RNG4,Umu); Grid::LatticeColourMatrix tmp(UGrid); tmp = Grid::zero; diff --git a/tests/qdpxx/Test_qdpxx_munprec.cc b/tests/qdpxx/Test_qdpxx_munprec.cc index fbc1ec82..82874546 100644 --- a/tests/qdpxx/Test_qdpxx_munprec.cc +++ b/tests/qdpxx/Test_qdpxx_munprec.cc @@ -502,7 +502,7 @@ void calc_grid(ChromaAction action,Grid::LatticeGaugeField & Umu, Grid::LatticeF Grid::gaussian(RNG5,src); Grid::gaussian(RNG5,res); - Grid::SU3::HotConfiguration(RNG4,Umu); + Grid::SU::HotConfiguration(RNG4,Umu); /* Grid::LatticeColourMatrix U(UGrid); diff --git a/tests/qdpxx/Test_qdpxx_stag.cc b/tests/qdpxx/Test_qdpxx_stag.cc index f283d5a9..8f81fa99 100644 --- a/tests/qdpxx/Test_qdpxx_stag.cc +++ b/tests/qdpxx/Test_qdpxx_stag.cc @@ -333,7 +333,7 @@ void make_gauge(GaugeField & Umu,FermionField &src) Grid::GridCartesian * UGrid = (Grid::GridCartesian *) Umu.Grid(); 
Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4,Umu); + Grid::SU::HotConfiguration(RNG4,Umu); Grid::gaussian(RNG4,src); } diff --git a/tests/qdpxx/Test_qdpxx_wilson.cc b/tests/qdpxx/Test_qdpxx_wilson.cc index fdf59982..8ce28dca 100644 --- a/tests/qdpxx/Test_qdpxx_wilson.cc +++ b/tests/qdpxx/Test_qdpxx_wilson.cc @@ -348,7 +348,7 @@ void make_gauge(GaugeField &Umu, FermionField &src) Grid::GridCartesian *UGrid = (Grid::GridCartesian *)Umu._grid; Grid::GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - Grid::SU3::HotConfiguration(RNG4, Umu); + Grid::SU::HotConfiguration(RNG4, Umu); // Fermion field Grid::gaussian(RNG4, src); diff --git a/tests/smearing/Test_smearing.cc b/tests/smearing/Test_smearing.cc index c1c7c457..adab1c6e 100644 --- a/tests/smearing/Test_smearing.cc +++ b/tests/smearing/Test_smearing.cc @@ -47,8 +47,8 @@ int main (int argc, char ** argv) RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); LatticeGaugeField Umu(&Grid); - // SU3::HotConfiguration(pRNG,Umu); - SU3::ColdConfiguration(Umu); + // SU::HotConfiguration(pRNG,Umu); + SU::ColdConfiguration(Umu); std::vector U(4,&Grid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ +/* END LEGAL */ + +#include + +using namespace Grid; + +#ifndef NBASIS +#define NBASIS 40 +#endif + +// NOTE: The tests in this file are written in analogy to +// - tests/core/Test_wilson_even_odd.cc +// - tests/core/Test_wilson_clover.cc + +std::vector readFromCommandlineIvec(int* argc, + char*** argv, + std::string&& option, + const std::vector& defaultValue) { + std::string arg; + std::vector ret(defaultValue); + if(GridCmdOptionExists(*argv, *argv + *argc, option)) { + arg = GridCmdOptionPayload(*argv, *argv + *argc, option); + GridCmdOptionIntVector(arg, ret); + } + return ret; +} + +int main(int argc, char** argv) { + Grid_init(&argc, &argv); + + ///////////////////////////////////////////////////////////////////////////// + // Read from command line // + ///////////////////////////////////////////////////////////////////////////// + + const int nbasis = NBASIS; static_assert((nbasis & 0x1) == 0, ""); + const int nb = nbasis/2; + Coordinate blockSize = readFromCommandlineIvec(&argc, &argv, "--blocksize", {2, 2, 2, 2}); + + std::cout << GridLogMessage << "Compiled with nbasis = " << nbasis << " -> nb = " << nb << std::endl; + + ///////////////////////////////////////////////////////////////////////////// + // General setup // + ///////////////////////////////////////////////////////////////////////////// + + Coordinate clatt = GridDefaultLatt(); + for(int d=0; dshow_decomposition(); + std::cout << GridLogMessage << "Grid_c:" << std::endl; Grid_c->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_f:" << std::endl; RBGrid_f->show_decomposition(); + std::cout << GridLogMessage << "RBGrid_c:" << std::endl; RBGrid_c->show_decomposition(); + + GridParallelRNG pRNG_f(Grid_f); + GridParallelRNG pRNG_c(Grid_c); + + std::vector seeds({1, 2, 3, 4}); + + 
pRNG_f.SeedFixedIntegers(seeds); + pRNG_c.SeedFixedIntegers(seeds); + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Dirac Matrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + LatticeGaugeField Umu(Grid_f); SU3::HotConfiguration(pRNG_f, Umu); + + RealD checkTolerance = (getPrecision::value == 1) ? 1e-7 : 1e-15; + + RealD mass = -0.30; + RealD csw = 1.9192; + + WilsonCloverFermionR Dwc(Umu, *Grid_f, *RBGrid_f, mass, csw, csw); + MdagMLinearOperator MdagMOp_Dwc(Dwc); + + ///////////////////////////////////////////////////////////////////////////// + // Type definitions // + ///////////////////////////////////////////////////////////////////////////// + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseDiracMatrix; + typedef CoarseDiracMatrix::CoarseVector CoarseVector; + + ///////////////////////////////////////////////////////////////////////////// + // Setup of Aggregation // + ///////////////////////////////////////////////////////////////////////////// + + Aggregates Aggs(Grid_c, Grid_f, 0); + { + LatticeFermion tmp(Aggs.subspace[0].Grid()); + for(int n = 0; n < nb; n++) { + gaussian(pRNG_f, Aggs.subspace[n]); + G5C(tmp, Aggs.subspace[n]); + axpby(Aggs.subspace[n + nb], 0.5, -0.5, Aggs.subspace[n], tmp); + axpby(Aggs.subspace[n], 0.5, 0.5, Aggs.subspace[n], tmp); + } + } + + ///////////////////////////////////////////////////////////////////////////// + // Setup of CoarsenedMatrix and Operator // + ///////////////////////////////////////////////////////////////////////////// + + const int hermitian = 0; + CoarseDiracMatrix Dc(*Grid_c, *RBGrid_c, hermitian); + Dc.CoarsenOperator(Grid_f, MdagMOp_Dwc, Aggs); + MdagMLinearOperator MdagMOp_Dc(Dc); + + ///////////////////////////////////////////////////////////////////////////// + // Setup vectors used in all tests // + ///////////////////////////////////////////////////////////////////////////// + + 
CoarseVector src(Grid_c); random(pRNG_c, src); + CoarseVector diff(Grid_c); diff = Zero(); + + ///////////////////////////////////////////////////////////////////////////// + // Start of tests // + ///////////////////////////////////////////////////////////////////////////// + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Dhop + Mdiag = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + Dc.Mdiag(src, phi); std::cout << GridLogMessage << "Applied Mdiag" << std::endl; + Dc.Dhop(src, chi, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + Dc.M(src, ref); std::cout << GridLogMessage << "Applied M" << std::endl; + + res = phi + chi; + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Munprec), norm2(Dhop + Mdiag), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev << " -> check " + << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe = Dhop" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector res_e(RBGrid_c); res_e = Zero(); + CoarseVector res_o(RBGrid_c); res_o = Zero(); + CoarseVector res(Grid_c); res = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + + Dc.Meooe(src_e, res_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(src_o, res_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Dhop(src, ref, DaggerNo); std::cout << GridLogMessage << "Applied Dhop" << std::endl; + + setCheckerboard(res, res_o); + setCheckerboard(res, res_e); + + diff = ref - res; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dhop), norm2(Meo + Moe), abs. deviation, rel. deviation: " + << norm2(ref) << " " << norm2(res) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag M^dag M v)| = 0" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.M(src, tmp); std::cout << GridLogMessage << "Applied M" << std::endl; + Dc.Mdag(tmp, phi); std::cout << GridLogMessage << "Applied Mdag" << std::endl; + + std::cout << GridLogMessage << "src = " << norm2(src) << " tmp = " << norm2(tmp) << " phi = " << norm2(phi) << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag M^dag M v), Im(v^dag M^dag M v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test |(Im(v^dag Mooee^dag Mooee v)| = 0 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeDag(tmp, phi); std::cout << GridLogMessage << "Applied MooeeDag" << std::endl; + + ComplexD dot = innerProduct(src, phi); + + auto relDev = abs(imag(dot)) / abs(real(dot)); + std::cout << GridLogMessage << "Re(v^dag Mooee^dag Mooee v), Im(v^dag Mooee^dag Mooee v), rel.deviation: " + << real(dot) << " " << imag(dot) << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (full grid)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + + Dc.Mooee(src, tmp); std::cout << GridLogMessage << "Applied Mooee" << std::endl; + Dc.MooeeInv(tmp, phi); std::cout << GridLogMessage << "Applied MooeeInv" << std::endl; + + diff = src - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(src); + std::cout << GridLogMessage << "norm2(src), norm2(MooeeInv Mooee src), abs. deviation, rel. deviation: " + << norm2(src) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MeooeDagger is the dagger of Meooe by requiring" << std::endl; + std::cout << GridLogMessage << "= < phi | Meooe | chi > * = < chi | Meooe^dag| phi>" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + // clang-format off + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + // clang-format on + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + Dc.Meooe(chi_e, dchi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + Dc.Meooe(chi_o, dchi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.MeooeDag(phi_e, dphi_o); std::cout << GridLogMessage << "Applied MeoDag" << std::endl; + Dc.MeooeDag(phi_o, dphi_e); std::cout << GridLogMessage << "Applied MoeDag" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogDebug << "norm dchi_e = " << norm2(dchi_e) << " norm dchi_o = " << norm2(dchi_o) << " norm dphi_e = " << norm2(dphi_e) + << " norm dphi_o = " << norm2(dphi_e) << std::endl; + + 
std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_o) " << phiDchi_e - conj(chiDphi_o) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_e) " << phiDchi_o - conj(chiDphi_e) << std::endl; + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeInv Mooee = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.Mooee(chi_e, tmp_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.MooeeInv(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInv" << std::endl; + Dc.Mooee(chi_o, tmp_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.MooeeInv(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInv" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeInv Mee chi), abs. deviation, rel. 
deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MooeeDag MooeeInvDag = 1 (checkerboards separately)" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector tmp(Grid_c); tmp = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector tmp_e(RBGrid_c); tmp_e = Zero(); + CoarseVector tmp_o(RBGrid_c); tmp_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, tmp_e, tmp); + pickCheckerboard(Odd, tmp_o, tmp); + + Dc.MooeeDag(chi_e, tmp_e); std::cout << GridLogMessage << "Applied MeeDag" << std::endl; + Dc.MooeeInvDag(tmp_e, phi_e); std::cout << GridLogMessage << "Applied MeeInvDag" << std::endl; + Dc.MooeeDag(chi_o, tmp_o); std::cout << GridLogMessage << "Applied MooDag" << std::endl; + Dc.MooeeInvDag(tmp_o, phi_o); std::cout << GridLogMessage << "Applied MooInvDag" << std::endl; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + diff = chi - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(chi); + std::cout << GridLogMessage << "norm2(chi), norm2(MeeDag MeeInvDag chi), abs. deviation, rel. deviation: " + << norm2(chi) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? 
"passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test Meo + Moe + Moo + Mee = Munprec" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector chi(Grid_c); chi = Zero(); + CoarseVector phi(Grid_c); phi = Zero(); + CoarseVector ref(Grid_c); ref = Zero(); + CoarseVector src_e(RBGrid_c); src_e = Zero(); + CoarseVector src_o(RBGrid_c); src_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + + pickCheckerboard(Even, src_e, src); + pickCheckerboard(Odd, src_o, src); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + + // M phi = (Mooee src_e + Meooe src_o , Mooee src_o + Meooe src_e) + + Dc.M(src, ref); // Reference result from the unpreconditioned operator + + // EO matrix + Dc.Mooee(src_e, chi_e); std::cout << GridLogMessage << "Applied Mee" << std::endl; + Dc.Mooee(src_o, chi_o); std::cout << GridLogMessage << "Applied Moo" << std::endl; + Dc.Meooe(src_o, phi_e); std::cout << GridLogMessage << "Applied Moe" << std::endl; + Dc.Meooe(src_e, phi_o); std::cout << GridLogMessage << "Applied Meo" << std::endl; + + phi_o += chi_o; + phi_e += chi_e; + + setCheckerboard(phi, phi_e); + setCheckerboard(phi, phi_o); + + std::cout << GridLogDebug << "norm phi_e = " << norm2(phi_e) << " norm phi_o = " << norm2(phi_o) << " norm phi = " << norm2(phi) << std::endl; + + diff = ref - phi; + auto absDev = norm2(diff); + auto relDev = absDev / norm2(ref); + std::cout << GridLogMessage << "norm2(Dunprec), norm2(Deoprec), abs. deviation, rel. 
deviation: " + << norm2(ref) << " " << norm2(phi) << " " << absDev << " " << relDev + << " -> check " << ((relDev < checkTolerance) ? "passed" : "failed") << std::endl; + assert(relDev <= checkTolerance); + } + + { + std::cout << GridLogMessage << "===========================================================================" << std::endl; + std::cout << GridLogMessage << "= Test MpcDagMpc is hermitian" << std::endl; + std::cout << GridLogMessage << "===========================================================================" << std::endl; + + CoarseVector phi(Grid_c); random(pRNG_c, phi); + CoarseVector chi(Grid_c); random(pRNG_c, chi); + CoarseVector chi_e(RBGrid_c); chi_e = Zero(); + CoarseVector chi_o(RBGrid_c); chi_o = Zero(); + CoarseVector dchi_e(RBGrid_c); dchi_e = Zero(); + CoarseVector dchi_o(RBGrid_c); dchi_o = Zero(); + CoarseVector phi_e(RBGrid_c); phi_e = Zero(); + CoarseVector phi_o(RBGrid_c); phi_o = Zero(); + CoarseVector dphi_e(RBGrid_c); dphi_e = Zero(); + CoarseVector dphi_o(RBGrid_c); dphi_o = Zero(); + + pickCheckerboard(Even, chi_e, chi); + pickCheckerboard(Odd, chi_o, chi); + pickCheckerboard(Even, phi_e, phi); + pickCheckerboard(Odd, phi_o, phi); + + SchurDiagMooeeOperator HermOpEO(Dc); + + HermOpEO.MpcDagMpc(chi_e, dchi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_e" << std::endl; + HermOpEO.MpcDagMpc(chi_o, dchi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to chi_o" << std::endl; + HermOpEO.MpcDagMpc(phi_e, dphi_e); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_e" << std::endl; + HermOpEO.MpcDagMpc(phi_o, dphi_o); std::cout << GridLogMessage << "Applied MpcDagMpc to phi_o" << std::endl; + + ComplexD phiDchi_e = innerProduct(phi_e, dchi_e); + ComplexD phiDchi_o = innerProduct(phi_o, dchi_o); + ComplexD chiDphi_e = innerProduct(chi_e, dphi_e); + ComplexD chiDphi_o = innerProduct(chi_o, dphi_o); + + std::cout << GridLogMessage << "e " << phiDchi_e << " " << chiDphi_e << std::endl; + std::cout << 
GridLogMessage << "o " << phiDchi_o << " " << chiDphi_o << std::endl; + + std::cout << GridLogMessage << "phiDchi_e - conj(chiDphi_e) " << phiDchi_e - conj(chiDphi_e) << std::endl; + std::cout << GridLogMessage << "phiDchi_o - conj(chiDphi_o) " << phiDchi_o - conj(chiDphi_o) << std::endl; + } + + Grid_finalize(); +} diff --git a/tests/solver/Test_contfrac_cg.cc b/tests/solver/Test_contfrac_cg.cc index 36614dfa..afabae4c 100644 --- a/tests/solver/Test_contfrac_cg.cc +++ b/tests/solver/Test_contfrac_cg.cc @@ -94,7 +94,7 @@ int main (int argc, char ** argv) GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); RealD mass=0.1; diff --git a/tests/solver/Test_dwf_cg_prec.cc b/tests/solver/Test_dwf_cg_prec.cc index cb53894f..debb736a 100644 --- a/tests/solver/Test_dwf_cg_prec.cc +++ b/tests/solver/Test_dwf_cg_prec.cc @@ -67,7 +67,7 @@ int main(int argc, char** argv) { result = Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl; diff --git a/tests/solver/Test_dwf_cg_schur.cc b/tests/solver/Test_dwf_cg_schur.cc index 6216c366..6541e73d 100644 --- a/tests/solver/Test_dwf_cg_schur.cc +++ b/tests/solver/Test_dwf_cg_schur.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) LatticeFermion src(FGrid); random(RNG5,src); LatticeFermion result(FGrid); result=Zero(); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); diff --git a/tests/solver/Test_dwf_fpgcr.cc 
b/tests/solver/Test_dwf_fpgcr.cc index 156f678a..42cc8de1 100644 --- a/tests/solver/Test_dwf_fpgcr.cc +++ b/tests/solver/Test_dwf_fpgcr.cc @@ -68,7 +68,7 @@ int main (int argc, char ** argv) LatticeFermion result(FGrid); result=Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4,Umu); + SU::HotConfiguration(RNG4,Umu); ConjugateResidual CR(1.0e-6,10000); diff --git a/tests/solver/Test_dwf_hdcr.cc b/tests/solver/Test_dwf_hdcr.cc index 8e083231..f68e99ab 100644 --- a/tests/solver/Test_dwf_hdcr.cc +++ b/tests/solver/Test_dwf_hdcr.cc @@ -222,9 +222,16 @@ int main (int argc, char ** argv) GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d); - GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; + + + GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian *CoarseCoarse5d = SpaceTimeGrid::makeFiveDimGrid(1,CoarseCoarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + GridRedBlackCartesian *CoarseCoarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseCoarse4d); + GridRedBlackCartesian *CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); std::vector cseeds({5,6,7,8}); @@ -282,8 +289,7 @@ int main (int argc, char ** argv) Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); - + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); ////////////////////////////////////////////////// // Deflate the course space. 
Recursive multigrid? @@ -311,12 +317,11 @@ int main (int argc, char ** argv) } } - Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + Level2Op L2Op(*CoarseCoarse5d,*CoarseCoarse5dRB,1); // Hermitian matrix typedef Level2Op::CoarseVector CoarseCoarseVector; HermitianLinearOperator L1LinOp(LDOp); L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); - std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class SolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + 
typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 32; + const int nbasisc= 32; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + std::string 
file("./ckpoint_lat.4000"); + //std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.05,500,200,150,0.0);// + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + typedef Level2Op::CoarseVector CoarseCoarseVector; + CoarseVector c_src(Coarse5d); c_src=1.0; + + std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.5,60.0,12,HermIndefOp,Ddwf); + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.005,1000); + // SchurDiagMooeeOperator CoarseMpcDagMpc(LDOp); + SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); + SolverWrapper CoarseSolver(LDOp,CoarseRBCG); + + // NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + 
CoarseSolver); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + // pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + 
ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + // std::string file("./ckpoint_lat.4000"); + // std::string file("./ckpoint_lat.1000"); + // NerscIO::readConfiguration(Umu,header,file); + SU::HotConfiguration(RNG4,Umu); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // 
Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,400,50,50,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + /* + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,15.0,0.02,1000,800,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + */ + typedef Level2Op::CoarseVector CoarseCoarseVector; + /* + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); + + + std::cout< IRLHermOpL2(L2Op); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + */ + /* + Chebyshev IRLChebyL2(0.001,15.0,301); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + */ + + /* + std::cout< IRLHermOp(LDOp); + // Chebyshev IRLCheby(0.001,15.0,301); + Chebyshev IRLCheby(0.03,12.0,101); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + 
PlainHermOp IRLOp (IRLHermOp); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + */ + CoarseVector c_src(Coarse5d); c_src=1.0; + // DeflatedGuesser DeflCoarseGuesser(evec,eval); + // NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + /* + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); + + // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); + */ + std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.01,1000); + NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + 
TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseCGNE); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + // pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); @@ -328,7 +330,7 @@ int main (int argc, char ** argv) Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); - Level1Op LDOp(*Coarse5d,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); std::cout< CoarseCG(0.01,1000); - ConjugateGradient CoarseCG(0.02,1000);// 14.7s + ConjugateGradient CoarseCG(0.01,2000);// 14.7s + eval.resize(0); + evec.resize(0,Coarse5d); DeflatedGuesser DeflCoarseGuesser(evec,eval); NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); diff --git a/tests/solver/Test_dwf_hdcr_48_rb.cc b/tests/solver/Test_dwf_hdcr_48_rb.cc new file mode 100644 index 00000000..a4d7bbb9 --- /dev/null +++ b/tests/solver/Test_dwf_hdcr_48_rb.cc @@ -0,0 +1,397 @@ +/************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_dwf_hdcr.cc + + Copyright (C) 2015 + +Author: Antonin Portelli +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class SolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD 
_hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + //std::vector block ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int 
d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + //std::string file("./ckpoint_lat.4000"); + std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + + Level1Op LDOp(*Coarse5d,*Coarse5dRB,1); LDOp.CoarsenOperator(FGrid,HermIndefOp,Aggregates); + + ////////////////////////////////////////////////// + // Deflate the course space. Recursive multigrid? 
+ ////////////////////////////////////////////////// + typedef Aggregation,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + typedef Level2Op::CoarseVector CoarseCoarseVector; + CoarseVector c_src(Coarse5d); c_src=1.0; + + std::cout< , SolverWrapper > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + std::cout< tols({0.015}); + std::vector ords({12}); + std::vector los({0.8}); + for(int l=0;l FineSmoother(los[l],60.0,ords[o],HermIndefOp,Ddwf); + ZeroGuesser CoarseZeroGuesser; + ConjugateGradient CoarseCG(tols[t],10000); + SchurRedBlackDiagMooeeSolve CoarseRBCG(CoarseCG); + SolverWrapper CoarseSolver(LDOp,CoarseRBCG); + + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseSolver); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + }}} + + ConjugateGradient pCG(1.0e-8,60000); + std::cout< HermOpEO(Ddwf); + pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout< +Author: Peter Boyle +Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace std; +using namespace Grid; +/* Params + * Grid: + * block1(4) + * block2(4) + * + * Subspace + * * Fine : Subspace(nbasis,hi,lo,order,first,step) -- 32, 60,0.02,500,100,100 + * * Coarse: Subspace(nbasis,hi,lo,order,first,step) -- 32, 18,0.02,500,100,100 + + * Smoother: + * * Fine: Cheby(hi, lo, order) -- 60,0.5,10 + * * Coarse: Cheby(hi, lo, order) -- 12,0.1,4 + + * Lanczos: + * CoarseCoarse IRL( Nk, Nm, Nstop, poly(lo,hi,order)) 24,36,24,0.002,4.0,61 + */ +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template class MirsSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & SmootherMatrix; + FineOperator & SmootherOperator; + RealD tol; + RealD shift; + int maxit; + + MirsSmoother(RealD _shift,RealD _tol,int _maxit,FineOperator &_SmootherOperator,Matrix &_SmootherMatrix) : + shift(_shift),tol(_tol),maxit(_maxit), + SmootherOperator(_SmootherOperator), + 
SmootherMatrix(_SmootherMatrix) + {}; + + void operator() (const Field &in, Field &out) + { + ZeroGuesser Guess; + ConjugateGradient CG(tol,maxit,false); + + Field src(in.Grid()); + + ShiftedMdagMLinearOperator,Field> MdagMOp(SmootherMatrix,shift); + SmootherOperator.AdjOp(in,src); + Guess(src,out); + CG(MdagMOp,src,out); + } +}; + +template +class MultiGridPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef CoarsenedMatrix CoarseOperator; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + + Aggregates & _Aggregates; + CoarseOperator & _CoarseOperator; + Matrix & _FineMatrix; + FineOperator & _FineOperator; + Guesser & _Guess; + FineSmoother & _Smoother; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + +#define GridLogLevel std::cout << GridLogMessage < block ({2,2,2,2}); + std::vector blockc ({2,2,2,2}); + const int nbasis= 40; + const int nbasisc= 40; + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + std::vector cseeds({5,6,7,8}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); + GridParallelRNG CRNG(Coarse5d);CRNG.SeedFixedIntegers(cseeds); + LatticeFermion src(FGrid); gaussian(RNG5,src);// src=src+g5*src; + LatticeFermion result(FGrid); + LatticeGaugeField Umu(UGrid); + + FieldMetaData header; + // std::string file("./ckpoint_lat.4000"); + std::string file("./ckpoint_lat.1000"); + NerscIO::readConfiguration(Umu,header,file); + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + std::cout< HermDefOp(Ddwf); + + Subspace 
Aggregates(Coarse5d,FGrid,0); + + assert ( (nbasis & 0x1)==0); + { + int nb=nbasis/2; + LatticeFermion A(FGrid); + LatticeFermion B(FGrid); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.002,1000,800,100,0.0); + // Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.02,1000,800,100,0.0); + Aggregates.CreateSubspaceChebyshev(RNG5,HermDefOp,nb,60.0,0.01,1000,100,100,0.0); // Slightly faster + + for(int n=0;n Level1Op; + typedef CoarsenedMatrix,nbasisc> Level2Op; + + Gamma5R5HermitianLinearOperator HermIndefOp(Ddwf); + + + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + std::cout << " Making 5D coarse RB grid " <,nbasisc> CoarseSubspace; + // CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< PosdefLdop(LDOp); + /* + { + int nb=nbasisc/2; + CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,15.0,0.02,1000,800,100,0.0); + for(int n=0;noSites();site++){ + subspace_g5[site](nn) = subspace[site](nn); + subspace_g5[site](nn+nb)=-subspace[site](nn+nb); + } + } + } + } + */ + typedef Level2Op::CoarseVector CoarseCoarseVector; + /* + Level2Op L2Op(*CoarseCoarse5d,1); // Hermitian matrix + HermitianLinearOperator L1LinOp(LDOp); + L2Op.CoarsenOperator(Coarse5d,L1LinOp,CoarseAggregates); + + + std::cout< IRLHermOpL2(L2Op); + CoarseCoarseVector cc_src(CoarseCoarse5d); cc_src=1.0; + */ + /* + Chebyshev IRLChebyL2(0.001,15.0,301); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + int cNk=24; + int cNm=36; + int cNstop=24; + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.1,1000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(L2Op,CoarseCoarseCG,DeflCoarseCoarseGuesser); + */ + + /* + std::cout< 
IRLHermOp(LDOp); + // Chebyshev IRLCheby(0.001,15.0,301); + Chebyshev IRLCheby(0.03,12.0,101); + FunctionHermOp IRLOpCheby(IRLCheby,IRLHermOp); + PlainHermOp IRLOp (IRLHermOp); + int Nk=64; + int Nm=128; + int Nstop=Nk; + ImplicitlyRestartedLanczos IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20); + + int Nconv; + std::vector eval(Nm); + std::vector evec(Nm,Coarse5d); + IRL.calc(eval,evec,c_src,Nconv); + */ + CoarseVector c_src(Coarse5d); c_src=1.0; + // DeflatedGuesser DeflCoarseGuesser(evec,eval); + // NormalEquations DeflCoarseCGNE(LDOp,CoarseCG,DeflCoarseGuesser); + + std::cout< , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner , NormalEquations > TwoLevelMG; + typedef MultiGridPreconditioner,nbasisc,Level1Op, DeflatedGuesser, NormalEquations > CoarseMG; + typedef MultiGridPreconditioner, LinearFunction > ThreeLevelMG; + + ChebyshevSmoother FineSmoother(0.25,60.0,12,HermIndefOp,Ddwf); + /* + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother(0.1,15.0,3,L1LinOp,LDOp); + + // MirsSmoother CoarseCGSmoother(0.1,0.1,4,L1LinOp,LDOp); + // MirsSmoother FineCGSmoother(0.0,0.01,8,HermIndefOp,Ddwf); + + CoarseMG Level2Precon (CoarseAggregates, L2Op, + L1LinOp,LDOp, + CoarseSmoother, + DeflCoarseCoarseGuesser, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + // PGCR Applying this solver to solve the coarse space problem + PrecGeneralisedConjugateResidual l2PGCR(0.1, 100, L1LinOp,Level2Precon,16,16); + l2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + ZeroGuesser CoarseZeroGuesser; + ThreeLevelMG ThreeLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + l2PGCR); + ThreeLevelPrecon.Level(1); + + // Apply the fine-coarse-coarsecoarse 2 deep MG preconditioner in an outer PGCR on the fine fgrid + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,1000,HermIndefOp,ThreeLevelPrecon,16,16); + l1PGCR.Level(1); + */ + 
std::cout< CoarseZeroGuesser; + ConjugateGradient CoarseCG(0.01,1000); + NormalEquations CoarseCGNE(LDOp,CoarseCG,CoarseZeroGuesser); + TwoLevelMG TwoLevelPrecon(Aggregates, LDOp, + HermIndefOp,Ddwf, + FineSmoother, + CoarseZeroGuesser, + CoarseCGNE); + TwoLevelPrecon.Level(1); + PrecGeneralisedConjugateResidual l1PGCR(1.0e-8,20,HermIndefOp,TwoLevelPrecon,16,16); + l1PGCR.Level(1); + l1PGCR(src,result); + + std::cout< pCG(1.0e-8,60000); + result=Zero(); + // pCG(HermDefOp,src,result); + + std::cout< HermOpEO(Ddwf); + pCG(HermOpEO,src_o,result_o); + + std::cout< PM; PM(HermDefOp,src); + std::cout< cPM; cPM(PosdefLdop,c_src); + // std::cout< ccPM; ccPM(IRLHermOpL2,cc_src); + + std::cout<::HotConfiguration(pRNG,Umu); /////////////////////////////////////////////////////////////// // Bounce these fields to disk diff --git a/tests/solver/Test_dwf_mrhs_cg_mpi.cc b/tests/solver/Test_dwf_mrhs_cg_mpi.cc index 8ace9b43..d0a32460 100644 --- a/tests/solver/Test_dwf_mrhs_cg_mpi.cc +++ b/tests/solver/Test_dwf_mrhs_cg_mpi.cc @@ -136,11 +136,11 @@ int main (int argc, char ** argv) std::cout << GridLogMessage << "Intialising 4D RNG "<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<::HotConfiguration(pRNG,Umu); ///////////////// // MPI only sends diff --git a/tests/solver/Test_dwf_multigrid.cc b/tests/solver/Test_dwf_multigrid.cc index 9e11c160..351e10fd 100644 --- a/tests/solver/Test_dwf_multigrid.cc +++ b/tests/solver/Test_dwf_multigrid.cc @@ -370,6 +370,11 @@ int main (int argc, char ** argv) GridCartesian *CoarseCoarse4d = SpaceTimeGrid::makeFourDimGrid(cclatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *CoarseCoarse5d = SpaceTimeGrid::makeFiveDimGrid(1,CoarseCoarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = 
SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); + GridRedBlackCartesian *CoarseCoarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseCoarse4d); + GridRedBlackCartesian *CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + std::vector seeds4({1,2,3,4}); std::vector seeds5({5,6,7,8}); std::vector cseeds({5,6,7,8}); @@ -434,8 +439,8 @@ int main (int argc, char ** argv) std::cout<::HotConfiguration(RNG4,Umu); std::vector U(4,UGrid); diff --git a/tests/solver/Test_hw_multigrid.cc b/tests/solver/Test_hw_multigrid.cc index b728faa7..66c88883 100644 --- a/tests/solver/Test_hw_multigrid.cc +++ b/tests/solver/Test_hw_multigrid.cc @@ -274,6 +274,8 @@ int main (int argc, char ** argv) GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());; GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(Ls,Coarse4d); + GridRedBlackCartesian * Coarse4dRB = SpaceTimeGrid::makeFourDimRedBlackGrid(Coarse4d); + GridRedBlackCartesian * Coarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,Coarse4d); std::vector seeds({1,2,3,4}); GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); @@ -335,7 +337,7 @@ int main (int argc, char ** argv) NonHermitianLinearOperator LinOpDwf(Ddwf); - Level1Op LDOp (*Coarse5d,0); + Level1Op LDOp (*Coarse5d,*Coarse5dRB,0); std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +// TODO +// +// Coarse Grid axpby_ssp_pminus // Inherit from spProj5pm +// Coarse Grid axpby_ssp_pplus + +template +class CayleyBase : public SparseMatrixBase +{ +public: + int Ls; + // protected: + RealD mass; + RealD M5; + // Save arguments to SetCoefficientsInternal + Vector _gamma; + RealD _zolo_hi; + RealD _b; + RealD _c; + + // Cayley form Moebius (tanh and zolotarev) + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; + // For preconditioning Cayley form + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; + // LDU factorisation of the eeoo matrix + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; +public: + CayleyBase(RealD _M5, RealD _mass, int _Ls, RealD b_, RealD c_) : + M5(_M5), + mass(_mass), + Ls(_Ls), + _b(b_), + _c(c_) + { + RealD eps = 1.0; + Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham + this->SetCoefficientsTanh(zdata,1.0,0.0); + Approx::zolotarev_free(zdata); + } + ///////////////////////////////////////////////////////// + // Replicates functionality + // Use a common base class approach + ///////////////////////////////////////////////////////// + // Tanh + void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); + } + //Zolo + void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data 
*zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); + } + //Zolo + void SetCoefficientsInternal(RealD zolo_hi,Vector & gamma,RealD b,RealD c) + { + int Ls=this->Ls; + + /////////////////////////////////////////////////////////// + // The Cayley coeffs (unprec) + /////////////////////////////////////////////////////////// + assert(gamma.size()==Ls); + + omega.resize(Ls); + bs.resize(Ls); + cs.resize(Ls); + as.resize(Ls); + + double bpc = b+c; + double bmc = b-c; + _b = b; + _c = c; + _gamma = gamma; // Save the parameters so we can change mass later. + _zolo_hi= zolo_hi; + for(int i=0; i < Ls; i++){ + as[i] = 1.0; + omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code + assert(omega[i]!=Coeff_t(0.0)); + bs[i] = 0.5*(bpc/omega[i] + bmc); + cs[i] = 0.5*(bpc/omega[i] - bmc); + } + + //////////////////////////////////////////////////////// + // Constants for the preconditioned matrix Cayley form + //////////////////////////////////////////////////////// + bee.resize(Ls); + cee.resize(Ls); + beo.resize(Ls); + ceo.resize(Ls); + + for(int i=0;iM5) +1.0); + assert(bee[i]!=Coeff_t(0.0)); + cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); + beo[i]=as[i]*bs[i]; + ceo[i]=-as[i]*cs[i]; + } + aee.resize(Ls); + aeo.resize(Ls); + for(int i=0;i &out){assert(0);}; + virtual void DW (const Field &psi, Field &chi)=0; + virtual void DWDag (const Field &psi, Field &chi)=0; + + void M (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + Meooe5D(psi,Din); + DW(Din,chi); + axpby(chi,1.0,1.0,chi,psi); + M5D(psi,chi); + } + void Mdag (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + DWDag(psi,Din); + MeooeDag5D(Din,chi); + M5Ddag(psi,chi); + axpby (chi,1.0,1.0,chi,psi); + } + ///////////////////////////////// + // P and Pdag - might be needed + ///////////////////////////////// + void P(const Field &psi, Field &chi) + { + int Ls= this->Ls; + chi=Zero(); + 
for(int s=0;sLs; + chi=Zero(); + for(int s=0;sLs; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; + M5D(psi,chi,chi,lower,diag,upper); + } + void M5Ddag (const Field &psi, Field &chi) + { + int Ls=this->Ls; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5Ddag(psi,chi,chi,lower,diag,upper); + } + void Meooe5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5D(psi,psi,Din,lower,diag,upper); + } + void MeooeDag5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag =bs; + Vector upper=cs; + Vector lower=cs; + + for (int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls =this->Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + uint64_t nloop = grid->oSites()/Ls; + + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss= sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1, tmp2; + for(int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls=this->Ls; + + uint64_t nloop = 
grid->oSites()/Ls; + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + for(int s=0;s +class CoarseCayleyFermion : public CayleyBase< Lattice > , ComplexD > +{ +public: + typedef iVector siteVector; + typedef Lattice CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + // Similar to the CoarseOperator but add 5D support. + Geometry geom; + GridBase *Coarse5D; + GridBase *Coarse4D; + CartesianStencil Stencil; + CoarsenedMatrix &Dw; + + GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know + + CoarseCayleyFermion(GridCartesian &CoarseGrid4, + GridCartesian &CoarseGrid5, + CoarsenedMatrix &_Dw, + RealD M5, RealD mass, int Ls, RealD b, RealD c) : + CayleyBase(M5,mass,Ls,b,c), + Coarse4D(&CoarseGrid4), + Coarse5D(&CoarseGrid5), + Dw(_Dw), + geom(CoarseGrid5._ndimension), + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + { + }; + +public: + void Project( CoarseVector &C ) + { + const int Nsimd = CComplex::Nsimd(); + autoView(Cv,C, AcceleratorWrite); + int Ls = this->Ls; + for(int s=0;soSites(), Nsimd, { + int sF= sU*Ls+s; + auto tmp = coalescedRead(Cv[sF]); + coalescedWrite(Cv[sF],tmp); + }); + } + } + //////////////////////////////////////////////// + // This is specific to Coarse Grid Cayley + //////////////////////////////////////////////// + virtual void Mdiag (const CoarseVector &in, CoarseVector &out) + { + std::vector allout(9,in.Grid()); + this->MdirAll(in,allout); + out = allout[8]; + } + virtual void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp) + { + assert(0); + } + virtual void MdirAll (const CoarseVector &in, std::vector &out) + { + conformable(Coarse5D,in.Grid()); + + 
SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + siteVector *CBp=Stencil.CommBuf(); + + int ptype; + int nb2=nbasis/2; + + autoView(in_v , in, AcceleratorRead); + autoView(st, Stencil, AcceleratorRead); + for(int point=0;pointoSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + calcVector nbr; + int ptype; + + StencilEntry *SE=st.GetEntry(ptype,point,sF); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + + { + calcVector nbr; + int ptype; + + for(int point=0;point_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb Aggregates; + + void PromoteFromSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(C4,C,s,0); + _Aggregates.PromoteFromSubspace(C4,F4); + InsertSlice(F4,F,s,0); + } + } + void ProjectToSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + 
CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(F4,F,s,0); + _Aggregates.ProjectToSubspace (C4,F4); + InsertSlice(C4,C,s,0); + } + Project(C); + } + template + void Test(Aggregates &_Aggregates,GridBase *FineGrid, Ddwf &_Ddwf) + { + typedef Lattice FineField; + CoarseVector Cin(Coarse5D); + CoarseVector Cout(Coarse5D); + CoarseVector CFout(Coarse5D); + + FineField Fin(FineGrid); + FineField Fout(FineGrid); + + + std::vector seeds({1,2,3,4,5}); + GridParallelRNG RNG(Coarse5D); RNG.SeedFixedIntegers(seeds); + + gaussian(RNG,Cin); + PromoteFromSubspace(_Aggregates,Cin,Fin); + ProjectToSubspace(_Aggregates,Cin,Fin); + + std::cout << GridLogMessage<< "************ "<M(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "<Mdag(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "< Directions(void) { return geom.directions;}; + virtual std::vector Displacements(void){ return geom.displacements;}; +}; + + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + virtual std::vector Directions(void) { return _Mat.Directions();}; + virtual std::vector Displacements(void){ return _Mat.Displacements();}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + 
} + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template +class MGPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + typedef CoarseCayleyFermion CoarseOperator; + // typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + MGPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother 
&PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + //typedef CoarseCayleyFermion CoarseOperator; + typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector g5Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< block ({2,2,2,2}); // 4,2,2,2 gets worse + std::vector blockc ({1,1,1,1}); + const int nbasis= 
24; + const int nbasisc= 32; // decrease, not improvement + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); +#if 0 + SU3::TepidConfiguration(RNG4,Umu); + RealD M5=1.0; +#else + std::string file("./ckpoint_lat.1000"); + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,file); + RealD M5=1.8; +#endif + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< MdagM_Dw(Dw_null); + + std::cout< WilsonCG(1.0e-10,40000); + LatticeFermion w_src(UGrid); w_src=1.0; + LatticeFermion w_res(UGrid); + WilsonCG(MdagM_Dw,w_src,w_res); + exit(0); + */ + std::cout< Level1Op4; + typedef CoarseCayleyFermion Level1Op5; + Level1Op4 c_Dw (*Coarse4d,0); + NonHermitianLinearOperator LinOpDw(Dw); + c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) + // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); + + std::cout< MdagM_cDwf(c_Dwf); + + std::cout<,nbasisc> Level2Op; + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< L1Hdwf(c_Dwf); + GridRedBlackCartesian * CoarseCoarse5dRB = SpaceTimeGrid::makeFiveDimRedBlackGrid(1,CoarseCoarse4d); + Level2Op cc_Dwf (*CoarseCoarse5d,*CoarseCoarse5dRB,1); // say it is hermitian + cc_Dwf.CoarsenOperator(Coarse5d,L1Hdwf,CoarseAggregates); + // cc_Dwf.Test(CoarseAggregates,Coarse5d,L1Hdwf); + + typedef Level2Op::CoarseVector CoarseCoarseVector; + + std::cout< CoarseCG(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + + NonHermitianLinearOperator CoarseM(c_Dwf); + MdagMLinearOperator 
CoarseMdagM(c_Dwf); + + NonHermitianLinearOperator CoarseCoarseM(cc_Dwf); + MdagMLinearOperator CoarseCoarseMdagM(cc_Dwf); + + + std::cout< PM; PM(MdagM_Dw,w_src); + std::cout< cPM; cPM(CoarseMdagM,c_src); + + cc_src=1.0; + PowerMethod ccPM; ccPM(CoarseCoarseMdagM,cc_src); + + std::cout< IRLHermOpL2(cc_Dwf); + Chebyshev IRLChebyL2(IRL_lo,IRL_hi,IRL_ord); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + cNm=0; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + cc_src=1.0; + // IRLL2.calc(eval2,evec2,cc_src,cNconv); + + ConjugateGradient CoarseCoarseCG(0.02,10000); + DeflatedGuesser DeflCoarseCoarseGuesser(evec2,eval2); + NormalEquations DeflCoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,DeflCoarseCoarseGuesser); + + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser CoarseCoarseZeroGuesser; + + std::cout< CoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,CoarseCoarseZeroGuesser); + { +typedef HDCRPreconditioner,nbasisc,NormalEquations > CoarseMG; + typedef MGPreconditioner > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + ChebyshevSmoother CoarseSmoother1(0.5,22.0,12,CoarseM,c_Dwf); // 37s, 26 iter + ChebyshevSmoother CoarseSmoother2(0.5,22.0,12,CoarseM,c_Dwf); + + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,7,CoarseM,c_Dwf); // 38s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.4,22.0,7,CoarseM,c_Dwf); // 41s, 27 iter + // ChebyshevSmoother CoarseSmoother2(0.4,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.6,22.0,6,CoarseM,c_Dwf); // 26 iter + // ChebyshevSmoother CoarseSmoother2(0.6,22.0,6,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,5,CoarseM,c_Dwf); // 33 iter, 55s + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,5,CoarseM,c_Dwf); + + + CoarseMG Level2Precon 
(CoarseAggregates, + CoarseM, + CoarseSmoother1, + CoarseSmoother2, + cc_Dwf, + DeflCoarseCoarseCGNE); + Level2Precon.Level(2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.5, 100, CoarseM,Level2Precon,16,16); // 26 iter, 37s + // PGCR Applying this solver to solve the coarse space problem + // COULD BE FIXED??? + PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0, 100, CoarseM,Level2Precon,16,16); // 35 iter, 45s + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.6, 100, CoarseM,Level2Precon,16,16); // 26,38 (diifferene is measurement noise) + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.2, 100, CoarseM,Level2Precon,16,16); // 26 iter, 47s + L2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + + ChebyshevSmoother FineSmoother1(0.5,60.0,16,FineM,Ddwf); + ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); // + + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // 
ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. + // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + + ThreeLevelMG ThreeLevelPrecon(Aggregates4D, + FineM, + FineSmoother1, + FineSmoother2, + c_Dwf, + L2PGCR); + ThreeLevelPrecon.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,FineM,ThreeLevelPrecon,16,16); + L1PGCR.Level(1); + + f_res=Zero(); + L1PGCR(f_src,f_res); + } + + std::cout< + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include +#include + +using namespace std; +using namespace Grid; + +// TODO +// +// Coarse Grid axpby_ssp_pminus // Inherit from spProj5pm +// Coarse Grid axpby_ssp_pplus + +template +class CayleyBase : public SparseMatrixBase +{ +public: + int Ls; + // protected: + RealD mass; + RealD M5; + // Save arguments to SetCoefficientsInternal + Vector _gamma; + RealD _zolo_hi; + RealD _b; + RealD _c; + + // Cayley form Moebius (tanh and zolotarev) + Vector omega; + Vector bs; // S dependent coeffs + Vector cs; + Vector as; + // For preconditioning Cayley form + Vector bee; + Vector cee; + Vector aee; + Vector beo; + Vector ceo; + Vector aeo; + // LDU factorisation of the eeoo matrix + Vector lee; + Vector leem; + Vector uee; + Vector ueem; + Vector dee; +public: + CayleyBase(RealD _M5, RealD _mass, int _Ls, RealD b_, RealD c_) : + M5(_M5), + mass(_mass), + Ls(_Ls), + _b(b_), + _c(c_) + { + RealD eps = 1.0; + Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham + this->SetCoefficientsTanh(zdata,1.0,0.0); + Approx::zolotarev_free(zdata); + } + ///////////////////////////////////////////////////////// + // Replicates functionality + // Use a common base class approach + ///////////////////////////////////////////////////////// + // Tanh + void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); + } + //Zolo + void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) + { + Vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); + } + //Zolo + void SetCoefficientsInternal(RealD zolo_hi,Vector & 
gamma,RealD b,RealD c) + { + int Ls=this->Ls; + + /////////////////////////////////////////////////////////// + // The Cayley coeffs (unprec) + /////////////////////////////////////////////////////////// + assert(gamma.size()==Ls); + + omega.resize(Ls); + bs.resize(Ls); + cs.resize(Ls); + as.resize(Ls); + + double bpc = b+c; + double bmc = b-c; + _b = b; + _c = c; + _gamma = gamma; // Save the parameters so we can change mass later. + _zolo_hi= zolo_hi; + for(int i=0; i < Ls; i++){ + as[i] = 1.0; + omega[i] = _gamma[i]*_zolo_hi; //NB reciprocal relative to Chroma NEF code + assert(omega[i]!=Coeff_t(0.0)); + bs[i] = 0.5*(bpc/omega[i] + bmc); + cs[i] = 0.5*(bpc/omega[i] - bmc); + } + + //////////////////////////////////////////////////////// + // Constants for the preconditioned matrix Cayley form + //////////////////////////////////////////////////////// + bee.resize(Ls); + cee.resize(Ls); + beo.resize(Ls); + ceo.resize(Ls); + + for(int i=0;iM5) +1.0); + assert(bee[i]!=Coeff_t(0.0)); + cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); + beo[i]=as[i]*bs[i]; + ceo[i]=-as[i]*cs[i]; + } + aee.resize(Ls); + aeo.resize(Ls); + for(int i=0;i &out){assert(0);}; + virtual void DW (const Field &psi, Field &chi)=0; + virtual void DWDag (const Field &psi, Field &chi)=0; + + void M (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + Meooe5D(psi,Din); + DW(Din,chi); + axpby(chi,1.0,1.0,chi,psi); + M5D(psi,chi); + } + void Mdag (const Field &psi, Field &chi) + { + Field Din(psi.Grid()); + DWDag(psi,Din); + MeooeDag5D(Din,chi); + M5Ddag(psi,chi); + axpby (chi,1.0,1.0,chi,psi); + } + ///////////////////////////////// + // P and Pdag - might be needed + ///////////////////////////////// + void P(const Field &psi, Field &chi) + { + int Ls= this->Ls; + chi=Zero(); + for(int s=0;sLs; + chi=Zero(); + for(int s=0;sLs; + Vector diag (Ls,1.0); + Vector upper(Ls,-1.0); upper[Ls-1]=mass; + Vector lower(Ls,-1.0); lower[0] =mass; + M5D(psi,chi,chi,lower,diag,upper); + } + void M5Ddag (const 
Field &psi, Field &chi) + { + int Ls=this->Ls; + Vector diag(Ls,1.0); + Vector upper(Ls,-1.0); + Vector lower(Ls,-1.0); + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5Ddag(psi,chi,chi,lower,diag,upper); + } + void Meooe5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag = bs; + Vector upper= cs; + Vector lower= cs; + upper[Ls-1]=-mass*upper[Ls-1]; + lower[0] =-mass*lower[0]; + M5D(psi,psi,Din,lower,diag,upper); + } + void MeooeDag5D (const Field &psi, Field &Din) + { + int Ls=this->Ls; + Vector diag =bs; + Vector upper=cs; + Vector lower=cs; + + for (int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls =this->Ls; + + // 10 = 3 complex mult + 2 complex add + // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) + uint64_t nloop = grid->oSites()/Ls; + + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss= sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1, tmp2; + for(int s=0;s &lower, + Vector &diag, + Vector &upper) + { + chi_i.Checkerboard()=psi_i.Checkerboard(); + GridBase *grid=psi_i.Grid(); + autoView(psi , psi_i,AcceleratorRead); + autoView(phi , phi_i,AcceleratorRead); + autoView(chi , chi_i,AcceleratorWrite); + assert(phi.Checkerboard() == psi.Checkerboard()); + + auto pdiag = &diag[0]; + auto pupper = &upper[0]; + auto plower = &lower[0]; + + int Ls=this->Ls; + + uint64_t nloop = grid->oSites()/Ls; + const int Nsimd = Field::vector_type::Nsimd(); + accelerator_for(sss,nloop,Nsimd,{ + uint64_t ss=sss*Ls; + typedef decltype(coalescedRead(psi[0])) spinor; + spinor tmp1,tmp2; + for(int s=0;s +class 
CoarseCayleyFermion : public CayleyBase< Lattice > , ComplexD > +{ +public: + typedef iVector siteVector; + typedef Lattice CoarseComplexField; + typedef Lattice CoarseVector; + typedef Lattice > CoarseMatrix; + typedef iMatrix Cobj; + typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field + typedef Lattice FineField; + + // Similar to the CoarseOperator but add 5D support. + Geometry geom; + GridBase *Coarse5D; + GridBase *Coarse4D; + CartesianStencil Stencil; + CoarsenedMatrix &Dw; + + GridBase * Grid(void) { return Coarse5D; }; // this is all the linalg routines need to know + + CoarseCayleyFermion(GridCartesian &CoarseGrid4, + GridCartesian &CoarseGrid5, + CoarsenedMatrix &_Dw, + RealD M5, RealD mass, int Ls, RealD b, RealD c) : + CayleyBase(M5,mass,Ls,b,c), + Coarse4D(&CoarseGrid4), + Coarse5D(&CoarseGrid5), + Dw(_Dw), + geom(CoarseGrid5._ndimension), + Stencil( &CoarseGrid5,geom.npoint,Even,geom.directions,geom.displacements,0) + { + }; + +public: + void Project( CoarseVector &C ) + { + const int Nsimd = CComplex::Nsimd(); + autoView(Cv,C, AcceleratorWrite); + int Ls = this->Ls; + for(int s=0;soSites(), Nsimd, { + int sF= sU*Ls+s; + auto tmp = coalescedRead(Cv[sF]); + coalescedWrite(Cv[sF],tmp); + }); + } + } + //////////////////////////////////////////////// + // This is specific to Coarse Grid Cayley + //////////////////////////////////////////////// + virtual void Mdiag (const CoarseVector &in, CoarseVector &out) + { + std::vector allout(9,in.Grid()); + this->MdirAll(in,allout); + out = allout[8]; + } + virtual void Mdir (const CoarseVector &in, CoarseVector &out,int dir, int disp) + { + assert(0); + } + virtual void MdirAll (const CoarseVector &in, std::vector &out) + { + conformable(Coarse5D,in.Grid()); + + SimpleCompressor compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + siteVector *CBp=Stencil.CommBuf(); + + 
int ptype; + int nb2=nbasis/2; + + autoView(in_v , in, AcceleratorRead); + autoView(st, Stencil, AcceleratorRead); + for(int point=0;pointoSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + calcVector nbr; + int ptype; + + StencilEntry *SE=st.GetEntry(ptype,point,sF); + + if(SE->_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb compressor; + + Stencil.HaloExchange(in,compressor); + typedef LatticeView Aview; + + const int Nsimd = CComplex::Nsimd(); + + // Ls loop for2D + int Ls=this->Ls; + + Vector AcceleratorViewContainer; + for(int p=0;poSites(), b, nbasis, Nsimd, { + + typedef decltype(coalescedRead(in_v[0])) calcVector; + typedef decltype(coalescedRead(in_v[0](0))) calcComplex; + int sU = sF/Ls; + int s = sF%Ls; + + calcComplex res = Zero(); + + { + calcVector nbr; + int ptype; + + for(int point=0;point_is_local) { + nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); + } else { + nbr = coalescedRead(CBp[SE->_offset]); + } + acceleratorSynchronise(); + + for(int bb=0;bb Aggregates; + + void PromoteFromSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(C4,C,s,0); + _Aggregates.PromoteFromSubspace(C4,F4); + InsertSlice(F4,F,s,0); + } + } + void ProjectToSubspace(Aggregates &_Aggregates,CoarseVector &C,FineField &F) + { + auto FineGrid4 = _Aggregates.FineGrid; + FineField F4(FineGrid4); + CoarseVector C4(Coarse4D); + for(int s=0;sLs;s++){ + ExtractSlice(F4,F,s,0); + _Aggregates.ProjectToSubspace (C4,F4); + InsertSlice(C4,C,s,0); + } + Project(C); + } + template + void Test(Aggregates &_Aggregates,GridBase 
*FineGrid, Ddwf &_Ddwf) + { + typedef Lattice FineField; + CoarseVector Cin(Coarse5D); + CoarseVector Cout(Coarse5D); + CoarseVector CFout(Coarse5D); + + FineField Fin(FineGrid); + FineField Fout(FineGrid); + + + std::vector seeds({1,2,3,4,5}); + GridParallelRNG RNG(Coarse5D); RNG.SeedFixedIntegers(seeds); + + gaussian(RNG,Cin); + PromoteFromSubspace(_Aggregates,Cin,Fin); + ProjectToSubspace(_Aggregates,Cin,Fin); + + std::cout << GridLogMessage<< "************ "<M(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "<Mdag(Cin,Cout); + this->Project(Cout); + std::cout << GridLogMessage<< " Cout "< Directions(void) { return geom.directions;}; + virtual std::vector Displacements(void){ return geom.displacements;}; +}; + +template class SchurSolverWrapper : public LinearFunction { +private: + CheckerBoardedSparseMatrixBase & _Matrix; + SchurRedBlackBase & _Solver; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SchurSolverWrapper(CheckerBoardedSparseMatrixBase &Matrix, + SchurRedBlackBase &Solver) + : _Matrix(Matrix), _Solver(Solver) {}; + + void operator() (const Field &in, Field &out){ + + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +template class SolverWrapper : public LinearFunction { +private: + LinearOperatorBase & _Matrix; + OperatorFunction & _Solver; + LinearFunction & _Guess; +public: + + ///////////////////////////////////////////////////// + // Wrap the usual normal equations trick + ///////////////////////////////////////////////////// + SolverWrapper(LinearOperatorBase &Matrix, + OperatorFunction &Solver, + LinearFunction &Guess) + : _Matrix(Matrix), _Solver(Solver), _Guess(Guess) {}; + + void operator() (const Field &in, Field &out){ + + _Guess(in,out); + _Solver(_Matrix,in,out); // Mdag M out = Mdag in + + } +}; + +// Must use a non-hermitian solver +template +class PVdagMLinearOperator : public 
LinearOperatorBase { + Matrix &_Mat; + Matrix &_PV; +public: + PVdagMLinearOperator(Matrix &Mat,Matrix &PV): _Mat(Mat),_PV(PV){}; + + virtual std::vector Directions(void) { return _Mat.Directions();}; + virtual std::vector Displacements(void){ return _Mat.Displacements();}; + + void OpDiag (const Field &in, Field &out) { + assert(0); + } + void OpDir (const Field &in, Field &out,int dir,int disp) { + assert(0); + } + void OpDirAll (const Field &in, std::vector &out){ + assert(0); + }; + void Op (const Field &in, Field &out){ + Field tmp(in.Grid()); + _Mat.M(in,tmp); + _PV.Mdag(tmp,out); + } + void AdjOp (const Field &in, Field &out){ + Field tmp(in.Grid()); + _PV.M(tmp,out); + _Mat.Mdag(in,tmp); + } + void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ + assert(0); + } + void HermOp(const Field &in, Field &out){ + assert(0); + } +}; + +RealD InverseApproximation(RealD x){ + return 1.0/x; +} + +template class ChebyshevSmoother : public LinearFunction +{ +public: + typedef LinearOperatorBase FineOperator; + Matrix & _SmootherMatrix; + FineOperator & _SmootherOperator; + + Chebyshev Cheby; + + ChebyshevSmoother(RealD _lo,RealD _hi,int _ord, FineOperator &SmootherOperator,Matrix &SmootherMatrix) : + _SmootherOperator(SmootherOperator), + _SmootherMatrix(SmootherMatrix), + Cheby(_lo,_hi,_ord,InverseApproximation) + {}; + + void operator() (const Field &in, Field &out) + { + Field tmp(in.Grid()); + MdagMLinearOperator MdagMOp(_SmootherMatrix); + _SmootherOperator.AdjOp(in,tmp); + Cheby(MdagMOp,tmp,out); + } +}; +template +class MGPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + typedef CoarseCayleyFermion CoarseOperator; + // typedef SparseMatrixBase 
CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + MGPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< +class HDCRPreconditioner : public LinearFunction< Lattice > { +public: + + typedef Aggregation Aggregates; + typedef typename Aggregation::CoarseVector CoarseVector; + typedef typename Aggregation::CoarseMatrix CoarseMatrix; + typedef typename Aggregation::FineField FineField; + typedef LinearOperatorBase FineOperator; + typedef LinearFunction FineSmoother; + //typedef CoarseCayleyFermion CoarseOperator; + typedef SparseMatrixBase CoarseOperator; + + Aggregates & _Aggregates; + FineOperator & _FineOperator; + FineSmoother & _PreSmoother; + FineSmoother & _PostSmoother; + CoarseOperator & _CoarseOperator; + CoarseSolver & _CoarseSolve; + + int level; void Level(int lv) {level = lv; }; + + HDCRPreconditioner(Aggregates &Agg, + FineOperator &Fine, + FineSmoother &PreSmoother, + FineSmoother &PostSmoother, + CoarseOperator &CoarseOperator_, + CoarseSolver &CoarseSolve_) + : _Aggregates(Agg), + _FineOperator(Fine), + _PreSmoother(PreSmoother), + _PostSmoother(PostSmoother), + _CoarseOperator(CoarseOperator_), + _CoarseSolve(CoarseSolve_), + level(1) { } + + virtual void operator()(const FineField &in, 
FineField & out) + { + auto CoarseGrid = _CoarseOperator.Grid(); + CoarseVector Csrc(CoarseGrid); + CoarseVector g5Csrc(CoarseGrid); + CoarseVector Csol(CoarseGrid); + FineField vec1(in.Grid()); + FineField vec2(in.Grid()); + + std::cout< block ({2,2,2,2}); // 4,2,2,2 gets worse + std::vector blockc ({1,1,1,1}); + const int nbasis= 24; + const int nbasisc= 40; // decrease, not improvement + + auto clatt = GridDefaultLatt(); + for(int d=0;d seeds({1,2,3,4}); + GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds); + GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds); + GridParallelRNG CRNG(Coarse4d);CRNG.SeedFixedIntegers(seeds); + + LatticeGaugeField Umu(UGrid); +#if 0 + SU3::TepidConfiguration(RNG4,Umu); + RealD M5=1.0; +#else + std::string file("./ckpoint_lat.1000"); + FieldMetaData header; + NerscIO::readConfiguration(Umu,header,file); + RealD M5=1.8; +#endif + + std::cout< Subspace; + typedef CoarsenedMatrix CoarseOperator; + typedef CoarseOperator::CoarseVector CoarseVector; + typedef CoarseOperator::siteVector siteVector; + + std::cout< MdagM_Dw(Dw_null); + + std::cout< WilsonCG(1.0e-10,40000); + LatticeFermion w_src(UGrid); w_src=1.0; + LatticeFermion w_res(UGrid); + WilsonCG(MdagM_Dw,w_src,w_res); + exit(0); + */ + std::cout< Level1Op4; + typedef CoarseCayleyFermion Level1Op5; + Level1Op4 c_Dw (*Coarse4d,0); + NonHermitianLinearOperator LinOpDw(Dw); + c_Dw.CoarsenOperator(UGrid,LinOpDw,Aggregates4D); // contains the M5 from Dw(-M5) + // c_Dw.Test(Aggregates4D,UGrid,LinOpDw); + + std::cout< MdagM_cDwf(c_Dwf); + + std::cout<,nbasisc> Level2Op; + typedef Aggregation,nbasisc> CoarseSubspace; + CoarseSubspace CoarseAggregates(CoarseCoarse5d,Coarse5d,0); + + std::cout< L1Hdwf(c_Dwf); + Level2Op cc_Dwf (*CoarseCoarse5d,*CoarseCoarse5dRB,1); // say it is hermitian + cc_Dwf.CoarsenOperator(Coarse5d,L1Hdwf,CoarseAggregates); + // cc_Dwf.Test(CoarseAggregates,Coarse5d,L1Hdwf); + + typedef Level2Op::CoarseVector CoarseCoarseVector; + + std::cout< 
CoarseCG(tol,MaxIt); + ConjugateGradient FineCG(tol,MaxIt); + + NonHermitianLinearOperator FineM(Ddwf); + MdagMLinearOperator FineMdagM(Ddwf); // M^\dag M + + NonHermitianLinearOperator CoarseM(c_Dwf); + MdagMLinearOperator CoarseMdagM(c_Dwf); + + NonHermitianLinearOperator CoarseCoarseM(cc_Dwf); + MdagMLinearOperator CoarseCoarseMdagM(cc_Dwf); + + + std::cout< PM; PM(MdagM_Dw,w_src); + std::cout< cPM; cPM(CoarseMdagM,c_src); + + cc_src=1.0; + PowerMethod ccPM; ccPM(CoarseCoarseMdagM,cc_src); + + std::cout< IRLHermOpL2(cc_Dwf); + Chebyshev IRLChebyL2(IRL_lo,IRL_hi,IRL_ord); + FunctionHermOp IRLOpChebyL2(IRLChebyL2,IRLHermOpL2); + PlainHermOp IRLOpL2 (IRLHermOpL2); + ImplicitlyRestartedLanczos IRLL2(IRLOpChebyL2,IRLOpL2,cNstop,cNk,cNm,1.0e-3,20); + + int cNconv; + cNm=0; + std::vector eval2(cNm); + std::vector evec2(cNm,CoarseCoarse5d); + cc_src=1.0; + // IRLL2.calc(eval2,evec2,cc_src,cNconv); + + std::vector tols ({0.005,0.001}); + std::vector c_los ({0.1,0.05}); + std::vector c_his ({22.0}); + std::vector f_los ({0.5,0.2}); + std::vector f_his ({60.0}); + std::vector ws ({2,3}); + std::vector c_ords ({32,24}); + std::vector f_ords ({20,16}); + + for(auto w : ws ) { + for(auto tol : tols ) { + for(auto f_ord : f_ords ) { + for(auto c_ord : c_ords ) { + for(auto c_lo : c_los ) { + for(auto c_hi : c_his ) { + for(auto f_lo : f_los ) { + for(auto f_hi : f_his ) { + ZeroGuesser CoarseZeroGuesser; + ZeroGuesser CoarseCoarseZeroGuesser; + ConjugateGradient CoarseCoarseCG(tol,10000); + ZeroGuesser CoarseCoarseGuesser; + SchurRedBlackDiagMooeeSolve CoarseCoarseRBCG(CoarseCoarseCG); + SchurSolverWrapper CoarseCoarseSolver(cc_Dwf,CoarseCoarseRBCG); + + std::cout< CoarseCoarseCGNE(cc_Dwf,CoarseCoarseCG,CoarseCoarseZeroGuesser); + { +typedef HDCRPreconditioner,nbasisc,LinearFunction > CoarseMG; + typedef MGPreconditioner > ThreeLevelMG; + + // MultiGrid preconditioner acting on the coarse space <-> coarsecoarse space + // ChebyshevSmoother 
CoarseSmoother1(0.5,22.0,c_ord,CoarseM,c_Dwf); // 37s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,c_ord,CoarseM,c_Dwf); + ChebyshevSmoother CoarseSmoother(c_lo,c_hi,c_ord,CoarseM,c_Dwf); // 37s, 26 iter + + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,7,CoarseM,c_Dwf); // 38s, 26 iter + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.4,22.0,7,CoarseM,c_Dwf); // 41s, 27 iter + // ChebyshevSmoother CoarseSmoother2(0.4,22.0,7,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.6,22.0,6,CoarseM,c_Dwf); // 26 iter + // ChebyshevSmoother CoarseSmoother2(0.6,22.0,6,CoarseM,c_Dwf); + // ChebyshevSmoother CoarseSmoother1(0.5,22.0,5,CoarseM,c_Dwf); // 33 iter, 55s + // ChebyshevSmoother CoarseSmoother2(0.5,22.0,5,CoarseM,c_Dwf); + + + CoarseMG Level2Precon (CoarseAggregates, + CoarseM, + CoarseSmoother, + CoarseSmoother, + cc_Dwf, + CoarseCoarseSolver); + Level2Precon.Level(2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.5, 100, CoarseM,Level2Precon,16,16); // 26 iter, 37s + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); // 296 s, 50 iter + // PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); // 250 s, 37 iter + PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.0, 1, CoarseM,Level2Precon,2,2); + + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(1.0, 100, CoarseM,Level2Precon,16,16); // 35 iter, 45s + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.6, 100, CoarseM,Level2Precon,16,16); // 26,38 (diifferene is measurement noise) + //PrecGeneralisedConjugateResidualNonHermitian L2PGCR(0.2, 100, CoarseM,Level2Precon,16,16); // 26 iter, 47s + L2PGCR.Level(2); + + // Wrap the 2nd level solver in a MultiGrid preconditioner acting on the fine space + + // ChebyshevSmoother FineSmoother1(0.5,60.0,14,FineM,Ddwf); // 26 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,14,FineM,Ddwf); + + // 
ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 38s + // ChebyshevSmoother FineSmoother2(0.5,60.0,16,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 23 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,20,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,10,FineM,Ddwf);24 iter, 44s + // ChebyshevSmoother FineSmoother2(0.5,60.0,24,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // odd convergence tail at 10^-9 ish + // ChebyshevSmoother FineSmoother2(0.1,60.0,24,FineM,Ddwf); // 33 iter, waas O(10-9 by 26) + + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 25 iter, 39s + // ChebyshevSmoother FineSmoother2(0.5,60.0,18,FineM,Ddwf); // + + ChebyshevSmoother FineSmoother(f_lo,f_hi,f_ord,FineM,Ddwf); + + // ChebyshevSmoother FineSmoother1(0.5,60.0,11,FineM,Ddwf); // 33 iter, 49s + // ChebyshevSmoother FineSmoother2(0.5,60.0,11,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.5,60.0,12,FineM,Ddwf); // 26 iter, 37s + // ChebyshevSmoother FineSmoother2(0.5,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.4,60.0,12,FineM,Ddwf); // iter 26 no change in final residual + // ChebyshevSmoother FineSmoother2(0.4,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,12,FineM,Ddwf); // 27 iter 39s. 
+ // ChebyshevSmoother FineSmoother2(0.3,60.0,12,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(0.3,60.0,13,FineM,Ddwf); // 26 iter, but slower + // ChebyshevSmoother FineSmoother2(0.3,60.0,13,FineM,Ddwf); + // ChebyshevSmoother FineSmoother1(1.0,60.0,12,FineM,Ddwf); // 34 iter, slower + // ChebyshevSmoother FineSmoother2(1.0,60.0,12,FineM,Ddwf); + + ThreeLevelMG ThreeLevelPrecon(Aggregates4D, + FineM, + FineSmoother, + FineSmoother, + c_Dwf, + L2PGCR); + ThreeLevelPrecon.Level(1); + + PrecGeneralisedConjugateResidualNonHermitian L1PGCR(1.0e-8,1000,FineM,ThreeLevelPrecon,16,16); + L1PGCR.Level(1); + + f_res=Zero(); + L1PGCR(f_src,f_res); + } + }}}} + }}} + } + std::cout<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<::HotConfiguration(pRNG,Umu); std::cout << GridLogMessage << "Intialised the HOT Gauge Field"<::ColdConfiguration(Umu); std::cout << GridLogMessage << "Intialised the COLD Gauge Field"<::HotConfiguration(pRNG,Umu); ///////////////// // MPI only sends diff --git a/tests/solver/Test_staggered_block_cg_prec.cc b/tests/solver/Test_staggered_block_cg_prec.cc index 2499fc8a..c5306e85 100644 --- a/tests/solver/Test_staggered_block_cg_prec.cc +++ b/tests/solver/Test_staggered_block_cg_prec.cc @@ -87,7 +87,7 @@ int main (int argc, char ** argv) FermionField result_o(FrbGrid); result_o=Zero(); RealD nrm = norm2(src); - LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(UGrid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); RealD mass=0.003; RealD c1=9.0/8.0; diff --git 
a/tests/solver/Test_staggered_cagmres_unprec.cc b/tests/solver/Test_staggered_cagmres_unprec.cc index 8121c90d..1b7a2f56 100644 --- a/tests/solver/Test_staggered_cagmres_unprec.cc +++ b/tests/solver/Test_staggered_cagmres_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); FermionField src(&Grid); random(pRNG,src); FermionField result(&Grid); result=Zero(); diff --git a/tests/solver/Test_staggered_cg_unprec.cc b/tests/solver/Test_staggered_cg_unprec.cc index 9625a9c8..e023b910 100644 --- a/tests/solver/Test_staggered_cg_unprec.cc +++ b/tests/solver/Test_staggered_cg_unprec.cc @@ -65,7 +65,7 @@ int main (int argc, char ** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/solver/Test_wilson_cg_schur.cc b/tests/solver/Test_wilson_cg_schur.cc index 23383032..97482131 
100644 --- a/tests/solver/Test_wilson_cg_schur.cc +++ b/tests/solver/Test_wilson_cg_schur.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); LatticeFermion src(&Grid); random(pRNG,src); LatticeFermion result(&Grid); result=Zero(); diff --git a/tests/solver/Test_wilson_cg_unprec.cc b/tests/solver/Test_wilson_cg_unprec.cc index f3335d45..07f6ba7b 100644 --- a/tests/solver/Test_wilson_cg_unprec.cc +++ b/tests/solver/Test_wilson_cg_unprec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/solver/Test_wilson_fcagmres_prec.cc b/tests/solver/Test_wilson_fcagmres_prec.cc index b821a25f..d2a1acf4 100644 --- a/tests/solver/Test_wilson_fcagmres_prec.cc +++ b/tests/solver/Test_wilson_fcagmres_prec.cc @@ -47,7 +47,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(fPRNG, Umu); // clang-format on RealD mass = -0.25; diff --git a/tests/solver/Test_wilson_mg_mp.cc b/tests/solver/Test_wilson_mg_mp.cc index e631cd15..89bbbf74 100644 --- a/tests/solver/Test_wilson_mg_mp.cc +++ b/tests/solver/Test_wilson_mg_mp.cc @@ -52,7 
+52,7 @@ int main(int argc, char **argv) { LatticeFermionD src_d(FGrid_d); gaussian(fPRNG, src_d); LatticeFermionD resultMGD_d(FGrid_d); resultMGD_d = Zero(); LatticeFermionD resultMGF_d(FGrid_d); resultMGF_d = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilson_mr_unprec.cc b/tests/solver/Test_wilson_mr_unprec.cc index 1cc1f418..fef83794 100644 --- a/tests/solver/Test_wilson_mr_unprec.cc +++ b/tests/solver/Test_wilson_mr_unprec.cc @@ -47,7 +47,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(RNG4,Umu); std::vector U(4,Grid); diff --git a/tests/solver/Test_wilsonclover_bicgstab_prec.cc b/tests/solver/Test_wilsonclover_bicgstab_prec.cc index c1905400..b382b1bb 100644 --- a/tests/solver/Test_wilsonclover_bicgstab_prec.cc +++ b/tests/solver/Test_wilsonclover_bicgstab_prec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); std::vector U(4,&Grid); diff --git a/tests/solver/Test_wilsonclover_cg_schur.cc b/tests/solver/Test_wilsonclover_cg_schur.cc index eaae24b3..567a8283 100644 --- 
a/tests/solver/Test_wilsonclover_cg_schur.cc +++ b/tests/solver/Test_wilsonclover_cg_schur.cc @@ -57,7 +57,7 @@ int main (int argc, char ** argv) std::vector seeds({1,2,3,4}); GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); LatticeFermion src(&Grid); random(pRNG,src); LatticeFermion result(&Grid); result=Zero(); diff --git a/tests/solver/Test_wilsonclover_cg_unprec.cc b/tests/solver/Test_wilsonclover_cg_unprec.cc index 49c52cdf..755d80e1 100644 --- a/tests/solver/Test_wilsonclover_cg_unprec.cc +++ b/tests/solver/Test_wilsonclover_cg_unprec.cc @@ -60,7 +60,7 @@ int main (int argc, char ** argv) LatticeFermion src(&Grid); random(pRNG,src); RealD nrm = norm2(src); LatticeFermion result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(fPRNG, Umu); // clang-format on RealD mass = -0.25; diff --git a/tests/solver/Test_wilsonclover_mg_lime.cc b/tests/solver/Test_wilsonclover_mg_lime.cc index bd2990d4..0a29c034 100644 --- a/tests/solver/Test_wilsonclover_mg_lime.cc +++ b/tests/solver/Test_wilsonclover_mg_lime.cc @@ -75,7 +75,7 @@ int main(int argc, char **argv) { NerscIO::readConfiguration(Umu_d,header,file); } #endif - // SU3::HotConfiguration(fPRNG, Umu_d); + // SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mg_mp.cc b/tests/solver/Test_wilsonclover_mg_mp.cc index b5178d2e..2efe5f08 100644 --- a/tests/solver/Test_wilsonclover_mg_mp.cc +++ b/tests/solver/Test_wilsonclover_mg_mp.cc @@ 
-52,7 +52,7 @@ int main(int argc, char **argv) { LatticeFermionD src_d(FGrid_d); gaussian(fPRNG, src_d); LatticeFermionD resultMGD_d(FGrid_d); resultMGD_d = zero; LatticeFermionD resultMGF_d(FGrid_d); resultMGF_d = zero; - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc b/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc index 0af83f8b..d47dac2a 100644 --- a/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc +++ b/tests/solver/Test_wilsonclover_mixedbicgstab_prec.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) // clang-format off LatticeFermionD src(FGrid_d); gaussian(fPRNG, src); LatticeFermionD result(FGrid_d); result = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mixedcg_prec.cc b/tests/solver/Test_wilsonclover_mixedcg_prec.cc index 8af9036f..95590004 100644 --- a/tests/solver/Test_wilsonclover_mixedcg_prec.cc +++ b/tests/solver/Test_wilsonclover_mixedcg_prec.cc @@ -61,7 +61,7 @@ int main (int argc, char ** argv) // clang-format off LatticeFermionD src(FGrid_d); gaussian(fPRNG, src); LatticeFermionD result(FGrid_d); result = Zero(); - LatticeGaugeFieldD Umu_d(FGrid_d); SU3::HotConfiguration(fPRNG, Umu_d); + LatticeGaugeFieldD Umu_d(FGrid_d); SU::HotConfiguration(fPRNG, Umu_d); LatticeGaugeFieldF Umu_f(FGrid_f); precisionChange(Umu_f, Umu_d); // clang-format on diff --git a/tests/solver/Test_wilsonclover_mr_unprec.cc b/tests/solver/Test_wilsonclover_mr_unprec.cc index c7b5ecfe..ab49ec1f 100644 --- a/tests/solver/Test_wilsonclover_mr_unprec.cc +++ 
b/tests/solver/Test_wilsonclover_mr_unprec.cc @@ -51,7 +51,7 @@ int main (int argc, char ** argv) FermionField src(&Grid); random(pRNG,src); RealD nrm = norm2(src); FermionField result(&Grid); result=Zero(); - LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu); + LatticeGaugeField Umu(&Grid); SU::HotConfiguration(pRNG,Umu); double volume=1; for(int mu=0;mu::HotConfiguration(RNG4, Umu); } std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() diff --git a/tests/solver/Test_zmobius_cg_prec.cc b/tests/solver/Test_zmobius_cg_prec.cc index fb57cff1..6b007afc 100644 --- a/tests/solver/Test_zmobius_cg_prec.cc +++ b/tests/solver/Test_zmobius_cg_prec.cc @@ -67,7 +67,7 @@ int main(int argc, char** argv) { result = Zero(); LatticeGaugeField Umu(UGrid); - SU3::HotConfiguration(RNG4, Umu); + SU::HotConfiguration(RNG4, Umu); std::cout << GridLogMessage << "Lattice dimensions: " << GridDefaultLatt() << " Ls: " << Ls << std::endl;