From 9296299b61508e8fb4a44fa32f656b7ab1857d9e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 4 Oct 2022 11:10:34 -0700 Subject: [PATCH 01/23] Better commenting --- Grid/qcd/action/fermion/WilsonCompressor.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Grid/qcd/action/fermion/WilsonCompressor.h b/Grid/qcd/action/fermion/WilsonCompressor.h index e0e08c1c..5c3351a6 100644 --- a/Grid/qcd/action/fermion/WilsonCompressor.h +++ b/Grid/qcd/action/fermion/WilsonCompressor.h @@ -117,19 +117,19 @@ public: typedef decltype(coalescedRead(*in)) sobj; typedef decltype(coalescedRead(*out0)) hsobj; - unsigned int Nsimd = vobj::Nsimd(); + constexpr unsigned int Nsimd = vobj::Nsimd(); unsigned int mask = Nsimd >> (type + 1); int lane = acceleratorSIMTlane(Nsimd); int j0 = lane &(~mask); // inner coor zero int j1 = lane |(mask) ; // inner coor one - const vobj *vp0 = &in[k]; - const vobj *vp1 = &in[m]; - const vobj *vp = (lane&mask) ? vp1:vp0; - auto sa = coalescedRead(*vp,j0); - auto sb = coalescedRead(*vp,j1); + const vobj *vp0 = &in[k]; // out0[j] = merge low bit of type from in[k] and in[m] + const vobj *vp1 = &in[m]; // out1[j] = merge hi bit of type from in[k] and in[m] + const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0 + auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing + auto sb = coalescedRead(*vp,j1); // lane to read for out 1 hsobj psa, psb; - projector::Proj(psa,sa,mu,dag); - projector::Proj(psb,sb,mu,dag); + projector::Proj(psa,sa,mu,dag); // spin project the result0 + projector::Proj(psb,sb,mu,dag); // spin project the result1 coalescedWrite(out0[j],psa); coalescedWrite(out1[j],psb); #else From e1e5c7502308e61b3649c87df56e1f3f31c7b90f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 4 Oct 2022 11:11:10 -0700 Subject: [PATCH 02/23] Stencil gather improvements - SVM was running slow and used for a pointer array that wasn't needed to be in SVM --- Grid/stencil/Stencil.h | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index c2bc8dab..65d878cb 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -80,11 +80,14 @@ void Gather_plane_simple_table (commVector >& table,const Lat /////////////////////////////////////////////////////////////////// template void Gather_plane_exchange_table(const Lattice &rhs, - commVector pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline)); + commVector pointers, + int dimension,int plane, + int cbmask,compressor &compress,int type) __attribute__((noinline)); template -void Gather_plane_exchange_table(commVector >& table,const Lattice &rhs, - Vector pointers,int dimension,int plane,int cbmask, +void Gather_plane_exchange_table(commVector >& table, + const Lattice &rhs, + std::vector &pointers,int dimension,int plane,int cbmask, compressor &compress,int type) { assert( (table.size()&0x1)==0); @@ -92,14 +95,15 @@ void Gather_plane_exchange_table(commVector >& table,const La int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane auto rhs_v = rhs.View(AcceleratorRead); + auto rhs_p = &rhs_v[0]; auto p0=&pointers[0][0]; auto p1=&pointers[1][0]; auto tp=&table[0]; accelerator_forNB(j, num, vobj::Nsimd(), { - compress.CompressExchange(p0,p1, &rhs_v[0], j, - so+tp[2*j ].second, - so+tp[2*j+1].second, - type); + compress.CompressExchange(p0,p1, rhs_p, j, + so+tp[2*j ].second, + so+tp[2*j+1].second, + type); }); rhs_v.ViewClose(); } @@ -230,8 +234,8 @@ public: }; struct Merge { cobj * mpointer; - Vector rpointers; - Vector vpointers; + // std::vector rpointers; + std::vector vpointers; Integer buffer_size; Integer type; }; @@ -406,6 +410,7 @@ public: comms_bytes+=bytes; shm_bytes +=2*Packets[i].bytes-bytes; } + _grid->StencilBarrier();// Synch shared memory on a single nodes } void CommunicateComplete(std::vector > &reqs) @@ -420,7 +425,7 @@ public: //////////////////////////////////////////////////////////////////////// void Communicate(void) { - if ( CartesianCommunicator::CommunicatorPolicy == CartesianCommunicator::CommunicatorPolicySequential ){ + if ( 0 ){ thread_region { // must be called in parallel region int mythread = thread_num(); @@ -569,7 +574,7 @@ public: d.buffer_size = buffer_size; dv.push_back(d); } - void AddMerge(cobj *merge_p,Vector &rpointers,Integer buffer_size,Integer type,std::vector &mv) { + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer type,std::vector &mv) { Merge m; m.type = type; m.mpointer = merge_p; @@ -582,6 +587,7 @@ public: } template void CommsMergeSHM(decompressor decompress) { mpi3synctime-=usecond(); + accelerator_barrier(); _grid->StencilBarrier();// Synch shared memory on a single nodes mpi3synctime+=usecond(); shmmergetime-=usecond(); @@ -1114,8 +1120,8 @@ public: int bytes = (reduced_buffer_size*datum_bytes)/simd_layout; assert(bytes*simd_layout == reduced_buffer_size*datum_bytes); - Vector rpointers(maxl); - Vector spointers(maxl); + std::vector rpointers(maxl); + std::vector spointers(maxl); /////////////////////////////////////////// // Work out what to send where From 03508448f88c08cfa146e5b0082b930d5de503d7 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 4 Oct 2022 11:12:15 -0700 Subject: [PATCH 03/23] Remove verbose --- Grid/threads/Accelerator.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/threads/Accelerator.h b/Grid/threads/Accelerator.h index d2cf8a01..e17e85d1 100644 --- a/Grid/threads/Accelerator.h +++ b/Grid/threads/Accelerator.h @@ -298,14 +298,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) { }); \ }); -#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); } +#define accelerator_barrier(dummy) { theGridAccelerator->wait(); } inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);}; inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);}; inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);}; inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);}; -inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); } +inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); } inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);} inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();} From 413312f9a9a98634363f5c5206be19e97ec55fca Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 4 Oct 2022 11:12:59 -0700 Subject: [PATCH 04/23] Benchmark the halo construction. THe bye counts are out and should be doubled for SIMD directions --- benchmarks/Benchmark_halo.cc | 131 +++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 benchmarks/Benchmark_halo.cc diff --git a/benchmarks/Benchmark_halo.cc b/benchmarks/Benchmark_halo.cc new file mode 100644 index 00000000..43138e67 --- /dev/null +++ b/benchmarks/Benchmark_halo.cc @@ -0,0 +1,131 @@ + /************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_dwf.cc + Copyright (C) 2015 + + Author: Peter Boyle + Author: paboyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#ifdef GRID_CUDA +#define CUDA_PROFILE +#endif + +#ifdef CUDA_PROFILE +#include +#endif + +using namespace std; +using namespace Grid; + +template +struct scal { + d internal; +}; + + Gamma::Algebra Gmu [] = { + Gamma::Algebra::GammaX, + Gamma::Algebra::GammaY, + Gamma::Algebra::GammaZ, + Gamma::Algebra::GammaT + }; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt4= GridDefaultLatt(); + Coordinate mpi = GridDefaultMpi(); + Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd()); + + GridLogLayout(); + + int Ls=16; + for(int i=0;i> Ls; + } + + + GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi); + GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); + GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); + GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); + + std::cout << GridLogMessage << "Making s innermost grids"< seeds4({1,2,3,4}); + std::vector seeds5({5,6,7,8}); + + std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; + GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); + std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; + GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); + std::cout << GridLogMessage << "Initialised RNGs" << std::endl; + + LatticeFermionF src (FGrid); random(RNG5,src); + RealD N2 = 1.0/::sqrt(norm2(src)); + src = src*N2; + + std::cout << GridLogMessage << "Drawing gauge field" << std::endl; + LatticeGaugeFieldF Umu(UGrid); + SU::HotConfiguration(RNG4,Umu); + std::cout << GridLogMessage << "Random gauge initialised " << std::endl; + + RealD mass=0.1; + RealD M5 =1.8; + + RealD NP = UGrid->_Nprocessors; + RealD NN = UGrid->NodeCount(); + + DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); + + const int ncall = 500; + std::cout << GridLogMessage<< "*********************************************************" <Barrier(); + Dw.Stencil.HaloExchangeOptGather(src,compressor); + double t0=usecond(); + for(int i=0;iBarrier(); + + double bytes=0.0; + if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3]; + if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3]; + if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3]; + if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2]; + bytes = bytes * Ls * 8.* (24.+12.)* 2.0; + + std::cout< Date: Tue, 25 Oct 2022 15:19:43 +0100 Subject: [PATCH 06/23] feat: changed CompactWilsonExpClover exponentiation to Taylor expansion with Horner scheme. --- Grid/qcd/action/fermion/CloverHelpers.h | 203 ++++-------------- ...CompactWilsonCloverFermionImplementation.h | 11 +- 2 files changed, 50 insertions(+), 164 deletions(-) diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h index 57e71998..c1df81fb 100644 --- a/Grid/qcd/action/fermion/CloverHelpers.h +++ b/Grid/qcd/action/fermion/CloverHelpers.h @@ -242,6 +242,19 @@ public: // mass term is multiplied to exp(Clover) below } + // Can this be avoided? + static void IdentityTimesC(const CloverField& in, RealD c) { + int DimRep = Impl::Dimension; + + autoView(in_v, in, AcceleratorWrite); + + accelerator_for(ss, in.Grid()->oSites(), 1, { + for (int sa=0; sa> &t, RealD R) {return getNMAX(1e-12,R);} - static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-6,R);} + static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-12,R);} + static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-6,R);} - static void ExponentiateHermitean6by6(const iMatrix &arg, const RealD& alpha, const std::vector& cN, const int Niter, iMatrix& dest){ + static void Exponentiate_Clover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { - typedef iMatrix mat; + GridBase* grid = Clover.Grid(); + CloverField ExpClover(grid); - RealD qn[6]; - RealD qnold[6]; - RealD p[5]; - RealD trA2, trA3, trA4; + int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass); - mat A2, A3, A4, A5; - A2 = alpha * alpha * arg * arg; - A3 = alpha * arg * A2; - A4 = A2 * A2; - A5 = A2 * A3; + Clover *= (1.0/diag_mass); - trA2 = toReal( trace(A2) ); - trA3 = toReal( trace(A3) ); - trA4 = toReal( trace(A4)); - - p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0; - p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0; - p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2; - p[3] = trA3 / 3.0; - p[4] = 0.5 * trA2; - - qnold[0] = cN[Niter]; - qnold[1] = 0.0; - qnold[2] = 0.0; - qnold[3] = 0.0; - qnold[4] = 0.0; - qnold[5] = 0.0; - - for(int i = Niter-1; i >= 0; i--) - { - qn[0] = p[0] * qnold[5] + cN[i]; - qn[1] = p[1] * qnold[5] + qnold[0]; - qn[2] = p[2] * qnold[5] + qnold[1]; - qn[3] = p[3] * qnold[5] + qnold[2]; - qn[4] = p[4] * qnold[5] + qnold[3]; - qn[5] = qnold[4]; - - qnold[0] = qn[0]; - qnold[1] = qn[1]; - qnold[2] = qn[2]; - qnold[3] = qn[3]; - qnold[4] = qn[4]; - qnold[5] = qn[5]; - } - - mat unit(1.0); - - dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5); - - } - - static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) { - - GridBase* grid = Diagonal.Grid(); - int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass); - - // - // Implementation completely in Daniel's layout - // - - // Taylor expansion with Cayley-Hamilton recursion - // underlying Horner scheme as above + // Taylor expansion, slow but generic + // Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...)) + // qN = cN + // qn = cn + qn+1 X std::vector cn(NMAX+1); cn[0] = 1.0; - for (int i=1; i<=NMAX; i++){ + for (int i=1; i<=NMAX; i++) cn[i] = cn[i-1] / RealD(i); + + ExpClover = Zero(); + IdentityTimesC(ExpClover, cn[NMAX]); + for (int i=NMAX-1; i>=0; i--) + ExpClover = ExpClover * Clover + cn[i]; + + // prepare inverse + CloverInv = (-1.0)*Clover; + + Clover = ExpClover * diag_mass; + + ExpClover = Zero(); + IdentityTimesC(ExpClover, cn[NMAX]); + for (int i=NMAX-1; i>=0; i--) + ExpClover = ExpClover * CloverInv + cn[i]; + + CloverInv = ExpClover * (1.0/diag_mass); + } - // Taken over from Daniel's implementation - conformable(Diagonal, Triangle); - - long lsites = grid->lSites(); - { - typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal; - typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle; - typedef iMatrix mat; - - autoView(diagonal_v, Diagonal, CpuRead); - autoView(triangle_v, Triangle, CpuRead); - autoView(diagonalExp_v, Diagonal, CpuWrite); - autoView(triangleExp_v, Triangle, CpuWrite); - - thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite - - mat srcCloverOpUL(0.0); // upper left block - mat srcCloverOpLR(0.0); // lower right block - mat ExpCloverOp; - - scalar_object_diagonal diagonal_tmp = Zero(); - scalar_object_diagonal diagonal_exp_tmp = Zero(); - scalar_object_triangle triangle_tmp = Zero(); - scalar_object_triangle triangle_exp_tmp = Zero(); - - Coordinate lcoor; - grid->LocalIndexToLocalCoor(site, lcoor); - - peekLocalSite(diagonal_tmp, diagonal_v, lcoor); - peekLocalSite(triangle_tmp, triangle_v, lcoor); - - int block; - block = 0; - for(int i = 0; i < 6; i++){ - for(int j = 0; j < 6; j++){ - if (i == j){ - srcCloverOpUL(i,j) = static_cast(TensorRemove(diagonal_tmp()(block)(i))); - } - else{ - srcCloverOpUL(i,j) = static_cast(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j))); - } - } - } - block = 1; - for(int i = 0; i < 6; i++){ - for(int j = 0; j < 6; j++){ - if (i == j){ - srcCloverOpLR(i,j) = static_cast(TensorRemove(diagonal_tmp()(block)(i))); - } - else{ - srcCloverOpLR(i,j) = static_cast(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j))); - } - } - } - - // exp(Clover) - - ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp); - - block = 0; - for(int i = 0; i < 6; i++){ - for(int j = 0; j < 6; j++){ - if (i == j){ - diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j); - } - else if(i < j){ - triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j); - } - } - } - - ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp); - - block = 1; - for(int i = 0; i < 6; i++){ - for(int j = 0; j < 6; j++){ - if (i == j){ - diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j); - } - else if(i < j){ - triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j); - } - } - } - - pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor); - pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor); - }); - } - - Diagonal *= diag_mass; - Triangle *= diag_mass; - } - - static GaugeLinkField Cmunu(std::vector &U, GaugeLinkField &lambda, int mu, int nu) { assert(0); return lambda; diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h index e4864730..e042b985 100644 --- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -305,6 +305,7 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie GridBase* grid = _Umu.Grid(); typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); CloverField TmpOriginal(grid); + CloverField TmpInverse(grid); // Compute the field strength terms mu>nu double t2 = usecond(); @@ -326,14 +327,14 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; // Handle mass term based on clover policy CloverHelpers::MassTerm(TmpOriginal, this->diag_mass); - + // Convert the data layout of the clover term double t4 = usecond(); - CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); + CloverHelpers::Exponentiate_Clover(TmpOriginal, TmpInverse, csw_t, this->diag_mass); // Exponentiate the clover (nothing happens in case of the standard clover) double t5 = usecond(); - CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass); + CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); // Possible modify the boundary values double t6 = usecond(); @@ -341,7 +342,9 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions) double t7 = usecond(); - CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); + //CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); + CompactHelpers::ConvertLayout(TmpInverse, DiagonalInv, TriangleInv); + //if(open_boundaries) handle differently! // Fill the remaining clover fields double t8 = usecond(); From 513d797ea63245fdffdafd5ea1ea66bb2e357168 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Tue, 25 Oct 2022 16:17:22 +0100 Subject: [PATCH 07/23] fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed. --- Grid/qcd/action/fermion/CloverHelpers.h | 5 +---- .../CompactWilsonCloverFermionImplementation.h | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h index c1df81fb..c2bb0d8f 100644 --- a/Grid/qcd/action/fermion/CloverHelpers.h +++ b/Grid/qcd/action/fermion/CloverHelpers.h @@ -208,10 +208,7 @@ public: Clover += diag_mass; } - static void Exponentiate_Clover(CloverDiagonalField& Diagonal, - CloverTriangleField& Triangle, - RealD csw_t, RealD diag_mass) { - + static void Exponentiate_Clover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { // Do nothing } diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h index e042b985..e45a531f 100644 --- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -365,8 +365,8 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl; std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl; std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl; - std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl; - std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl; + std::cout << GridLogDebug << "exponentiation = " << (t5 - t4) / 1e6 << std::endl; + std::cout << GridLogDebug << "convert = " << (t6 - t5) / 1e6 << std::endl; std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl; std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl; std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl; From b36442e263e117ad771b4416dac469dcb6aba2fa Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Tue, 25 Oct 2022 16:57:01 +0100 Subject: [PATCH 08/23] feat: CloverHelpers::InvertClover implemented which handles the inversion of the Clover term depending on clover type and the boundary conditions. --- Grid/qcd/action/fermion/CloverHelpers.h | 31 +++++++++++++++++-- ...CompactWilsonCloverFermionImplementation.h | 6 ++-- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h index c2bb0d8f..e8c45888 100644 --- a/Grid/qcd/action/fermion/CloverHelpers.h +++ b/Grid/qcd/action/fermion/CloverHelpers.h @@ -208,10 +208,20 @@ public: Clover += diag_mass; } - static void Exponentiate_Clover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { + static void ExponentiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { // Do nothing } + static void InvertClover(CloverField& InvClover, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle, + CloverDiagonalField& diagonalInv, + CloverTriangleField& triangleInv, + bool open_boundaries) { + + CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv); + } + // TODO: implement Cmunu for better performances with compact layout, but don't do it // here, but rather in WilsonCloverHelpers.h -> CompactWilsonCloverHelpers static GaugeLinkField Cmunu(std::vector &U, GaugeLinkField &lambda, int mu, int nu) { @@ -267,7 +277,7 @@ public: static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-12,R);} static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-6,R);} - static void Exponentiate_Clover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { + static void ExponentiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { GridBase* grid = Clover.Grid(); CloverField ExpClover(grid); @@ -302,7 +312,24 @@ public: CloverInv = ExpClover * (1.0/diag_mass); + } + + static void InvertClover(CloverField& InvClover, + const CloverDiagonalField& diagonal, + const CloverTriangleField& triangle, + CloverDiagonalField& diagonalInv, + CloverTriangleField& triangleInv, + bool open_boundaries) { + + if (open_boundaries) + { + CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv); } + else + { + CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv); + } + } static GaugeLinkField Cmunu(std::vector &U, GaugeLinkField &lambda, int mu, int nu) { assert(0); diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h index e45a531f..6ee22113 100644 --- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -330,7 +330,7 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie // Convert the data layout of the clover term double t4 = usecond(); - CloverHelpers::Exponentiate_Clover(TmpOriginal, TmpInverse, csw_t, this->diag_mass); + CloverHelpers::ExponentiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass); // Exponentiate the clover (nothing happens in case of the standard clover) double t5 = usecond(); @@ -342,9 +342,7 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions) double t7 = usecond(); - //CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv); - CompactHelpers::ConvertLayout(TmpInverse, DiagonalInv, TriangleInv); - //if(open_boundaries) handle differently! + CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, open_boundaries); // Fill the remaining clover fields double t8 = usecond(); From 86075fdd45a1f88108615a6dada2df2bee5af8ad Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Tue, 25 Oct 2022 17:05:34 +0100 Subject: [PATCH 09/23] feat: MassTerm and ExponentiateClover merged into InstantiateClover --- Grid/qcd/action/fermion/CloverHelpers.h | 13 ++----------- .../CompactWilsonCloverFermionImplementation.h | 12 ++++++------ 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h index e8c45888..b8b4fa59 100644 --- a/Grid/qcd/action/fermion/CloverHelpers.h +++ b/Grid/qcd/action/fermion/CloverHelpers.h @@ -204,14 +204,10 @@ public: typedef WilsonCloverHelpers Helpers; typedef CompactWilsonCloverHelpers CompactHelpers; - static void MassTerm(CloverField& Clover, RealD diag_mass) { + static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { Clover += diag_mass; } - static void ExponentiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { - // Do nothing - } - static void InvertClover(CloverField& InvClover, const CloverDiagonalField& diagonal, const CloverTriangleField& triangle, @@ -244,11 +240,6 @@ public: template using iImplClover = iScalar, Ns>>; typedef CompactWilsonCloverHelpers CompactHelpers; - static void MassTerm(CloverField& Clover, RealD diag_mass) { - // do nothing! - // mass term is multiplied to exp(Clover) below - } - // Can this be avoided? static void IdentityTimesC(const CloverField& in, RealD c) { int DimRep = Impl::Dimension; @@ -277,7 +268,7 @@ public: static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-12,R);} static int getNMAX(Lattice> &t, RealD R) {return getNMAX(1e-6,R);} - static void ExponentiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { + static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) { GridBase* grid = Clover.Grid(); CloverField ExpClover(grid); diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h index 6ee22113..0828f5fc 100644 --- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -325,14 +325,14 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t; TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t; TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t; - // Handle mass term based on clover policy - CloverHelpers::MassTerm(TmpOriginal, this->diag_mass); + + // Instantiate the clover term + // - In case of the standard clover the mass term is added + // - In case of the exponential clover the clover term is exponentiated + double t4 = usecond(); + CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass); // Convert the data layout of the clover term - double t4 = usecond(); - CloverHelpers::ExponentiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass); - - // Exponentiate the clover (nothing happens in case of the standard clover) double t5 = usecond(); CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); From 9317d893b201a54c7dfa411fd0aaf6a32aa6f61b Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Tue, 25 Oct 2022 17:10:06 +0100 Subject: [PATCH 10/23] docs: details about inversion of CompactClover term added. --- .../CompactWilsonCloverFermionImplementation.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h index 0828f5fc..40bdfb43 100644 --- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -340,7 +340,10 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie double t6 = usecond(); if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); - // Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions) + // Invert the Clover term + // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved + // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted. + // TODO: For now this inversion is explictly done on the CPU double t7 = usecond(); CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, open_boundaries); From a2a879b668d008ad5f70f77e7160c8e855c444a0 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Tue, 25 Oct 2022 17:20:42 +0100 Subject: [PATCH 11/23] docs: CompactClover Debug Info improved. --- .../CompactWilsonCloverFermionImplementation.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h index 40bdfb43..16f89725 100644 --- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -336,7 +336,7 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie double t5 = usecond(); CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle); - // Possible modify the boundary values + // Modify the clover term at the temporal boundaries in case of open boundary conditions double t6 = usecond(); if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); @@ -366,10 +366,10 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl; std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl; std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl; - std::cout << GridLogDebug << "exponentiation = " << (t5 - t4) / 1e6 << std::endl; - std::cout << GridLogDebug << "convert = " << (t6 - t5) / 1e6 << std::endl; - std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl; - std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl; + std::cout << GridLogDebug << "exponentiate clover = " << (t5 - t4) / 1e6 << std::endl; + std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl; + std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl; + std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl; std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl; std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl; } From 5fa6a8b96d6d7d82dfb2ab2c0ebfb844871bcc6b Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Wed, 26 Oct 2022 12:40:28 +0100 Subject: [PATCH 12/23] docs: CompactClover debug info generalized. --- .../implementation/CompactWilsonCloverFermionImplementation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h index 16f89725..c8d1ca30 100644 --- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -366,7 +366,7 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl; std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl; std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl; - std::cout << GridLogDebug << "exponentiate clover = " << (t5 - t4) / 1e6 << std::endl; + std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl; std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl; std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl; std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl; From 184adeedb83c716b879d2026d1af902aa2457d02 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Wed, 26 Oct 2022 12:53:46 +0100 Subject: [PATCH 13/23] feat: renamed open_boundaries to fixedBoundaries --- Grid/qcd/action/fermion/CloverHelpers.h | 6 ++-- .../fermion/CompactWilsonCloverFermion.h | 2 +- ...CompactWilsonCloverFermionImplementation.h | 32 +++++++++---------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Grid/qcd/action/fermion/CloverHelpers.h b/Grid/qcd/action/fermion/CloverHelpers.h index b8b4fa59..cd469ea7 100644 --- a/Grid/qcd/action/fermion/CloverHelpers.h +++ b/Grid/qcd/action/fermion/CloverHelpers.h @@ -213,7 +213,7 @@ public: const CloverTriangleField& triangle, CloverDiagonalField& diagonalInv, CloverTriangleField& triangleInv, - bool open_boundaries) { + bool fixedBoundaries) { CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv); } @@ -310,9 +310,9 @@ public: const CloverTriangleField& triangle, CloverDiagonalField& diagonalInv, CloverTriangleField& triangleInv, - bool open_boundaries) { + bool fixedBoundaries) { - if (open_boundaries) + if (fixedBoundaries) { CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv); } diff --git a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h index 249b20bd..d79b34d4 100644 --- a/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h +++ b/Grid/qcd/action/fermion/CompactWilsonCloverFermion.h @@ -225,7 +225,7 @@ public: RealD csw_t; RealD cF; - bool open_boundaries; + bool fixedBoundaries; CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd; CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd; diff --git a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h index c8d1ca30..7e3b7f00 100644 --- a/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h +++ b/Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermionImplementation.h @@ -48,7 +48,7 @@ CompactWilsonCloverFermion::CompactWilsonCloverFermion(Gaug , csw_r(_csw_r) , csw_t(_csw_t) , cF(_cF) - , open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0) + , fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0) , Diagonal(&Fgrid), Triangle(&Fgrid) , DiagonalEven(&Hgrid), TriangleEven(&Hgrid) , DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid) @@ -67,7 +67,7 @@ CompactWilsonCloverFermion::CompactWilsonCloverFermion(Gaug csw_r /= clover_anisotropy.xi_0; ImportGauge(_Umu); - if (open_boundaries) { + if (fixedBoundaries) { this->BoundaryMaskEven.Checkerboard() = Even; this->BoundaryMaskOdd.Checkerboard() = Odd; CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd); @@ -77,31 +77,31 @@ CompactWilsonCloverFermion::CompactWilsonCloverFermion(Gaug template void CompactWilsonCloverFermion::Dhop(const FermionField& in, FermionField& out, int dag) { WilsonBase::Dhop(in, out, dag); - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template void CompactWilsonCloverFermion::DhopOE(const FermionField& in, FermionField& out, int dag) { WilsonBase::DhopOE(in, out, dag); - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template void CompactWilsonCloverFermion::DhopEO(const FermionField& in, FermionField& out, int dag) { WilsonBase::DhopEO(in, out, dag); - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template void CompactWilsonCloverFermion::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) { WilsonBase::DhopDir(in, out, dir, disp); - if(this->open_boundaries) ApplyBoundaryMask(out); + if(this->fixedBoundaries) ApplyBoundaryMask(out); } template void CompactWilsonCloverFermion::DhopDirAll(const FermionField& in, std::vector& out) { WilsonBase::DhopDirAll(in, out); - if(this->open_boundaries) { + if(this->fixedBoundaries) { for(auto& o : out) ApplyBoundaryMask(o); } } @@ -112,7 +112,7 @@ void CompactWilsonCloverFermion::M(const FermionField& in, WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc Mooee(in, Tmp); axpy(out, 1.0, out, Tmp); - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template @@ -121,19 +121,19 @@ void CompactWilsonCloverFermion::Mdag(const FermionField& i WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc MooeeDag(in, Tmp); axpy(out, 1.0, out, Tmp); - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template void CompactWilsonCloverFermion::Meooe(const FermionField& in, FermionField& out) { WilsonBase::Meooe(in, out); - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template void CompactWilsonCloverFermion::MeooeDag(const FermionField& in, FermionField& out) { WilsonBase::MeooeDag(in, out); - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template @@ -147,7 +147,7 @@ void CompactWilsonCloverFermion::Mooee(const FermionField& } else { MooeeInternal(in, out, Diagonal, Triangle); } - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template @@ -166,7 +166,7 @@ void CompactWilsonCloverFermion::MooeeInv(const FermionFiel } else { MooeeInternal(in, out, DiagonalInv, TriangleInv); } - if(open_boundaries) ApplyBoundaryMask(out); + if(fixedBoundaries) ApplyBoundaryMask(out); } template @@ -186,7 +186,7 @@ void CompactWilsonCloverFermion::MdirAll(const FermionField template void CompactWilsonCloverFermion::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) { - assert(!open_boundaries); // TODO check for changes required for open bc + assert(!fixedBoundaries); // TODO check for changes required for open bc // NOTE: code copied from original clover term conformable(X.Grid(), Y.Grid()); @@ -338,14 +338,14 @@ void CompactWilsonCloverFermion::ImportGauge(const GaugeFie // Modify the clover term at the temporal boundaries in case of open boundary conditions double t6 = usecond(); - if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); + if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass); // Invert the Clover term // In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved // in TmpInverse can be used. In all other cases the clover term has to be explictly inverted. // TODO: For now this inversion is explictly done on the CPU double t7 = usecond(); - CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, open_boundaries); + CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries); // Fill the remaining clover fields double t8 = usecond(); From 82e959f66cac74c3863eaa0b42f3c8b42ae874d8 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 8 Nov 2022 12:45:25 -0800 Subject: [PATCH 14/23] SYCL reduction --- Grid/lattice/Lattice_reduction.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h index 0ddac437..fb6a258c 100644 --- a/Grid/lattice/Lattice_reduction.h +++ b/Grid/lattice/Lattice_reduction.h @@ -28,6 +28,9 @@ Author: Christoph Lehner #if defined(GRID_CUDA)||defined(GRID_HIP) #include #endif +#if defined(GRID_SYCL) +#include +#endif NAMESPACE_BEGIN(Grid); @@ -127,7 +130,7 @@ inline Double max(const Double *arg, Integer osites) template inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) { -#if defined(GRID_CUDA)||defined(GRID_HIP) +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) return sum_gpu(arg,osites); #else return sum_cpu(arg,osites); @@ -136,7 +139,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) template inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) { -#if defined(GRID_CUDA)||defined(GRID_HIP) +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) return sumD_gpu(arg,osites); #else return sumD_cpu(arg,osites); @@ -145,7 +148,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites) template inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) { -#if defined(GRID_CUDA)||defined(GRID_HIP) +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) return sumD_gpu_large(arg,osites); #else return sumD_cpu(arg,osites); @@ -155,13 +158,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites) template inline typename vobj::scalar_object sum(const Lattice &arg) { -#if defined(GRID_CUDA)||defined(GRID_HIP) - autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); - auto ssum= sum_gpu(&arg_v[0],osites); +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) + typename vobj::scalar_object ssum; + autoView( arg_v, arg, AcceleratorRead); + ssum= sum_gpu(&arg_v[0],osites); #else autoView(arg_v, arg, CpuRead); - Integer osites = arg.Grid()->oSites(); auto ssum= sum_cpu(&arg_v[0],osites); #endif arg.Grid()->GlobalSum(ssum); @@ -171,7 +174,7 @@ inline typename vobj::scalar_object sum(const Lattice &arg) template inline typename vobj::scalar_object sum_large(const Lattice &arg) { -#if defined(GRID_CUDA)||defined(GRID_HIP) +#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL) autoView( arg_v, arg, AcceleratorRead); Integer osites = arg.Grid()->oSites(); auto ssum= sum_gpu_large(&arg_v[0],osites); @@ -235,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice &left,const Lattice & typedef decltype(innerProductD(vobj(),vobj())) inner_t; Vector inner_tmp(sites); auto inner_tmp_v = &inner_tmp[0]; - { autoView( left_v , left, AcceleratorRead); autoView( right_v,right, AcceleratorRead); - + // This code could read coalesce // GPU - SIMT lane compliance... accelerator_for( ss, sites, 1,{ auto x_l = left_v[ss]; From bd68861b28b24ce85adcbcbe5a7c50b383446b3f Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 8 Nov 2022 12:49:26 -0800 Subject: [PATCH 15/23] SYCL sum --- Grid/lattice/Lattice_reduction_sycl.h | 125 ++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 Grid/lattice/Lattice_reduction_sycl.h diff --git a/Grid/lattice/Lattice_reduction_sycl.h b/Grid/lattice/Lattice_reduction_sycl.h new file mode 100644 index 00000000..90980c4c --- /dev/null +++ b/Grid/lattice/Lattice_reduction_sycl.h @@ -0,0 +1,125 @@ +NAMESPACE_BEGIN(Grid); + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// Possibly promote to double and sum +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_objectD sobjD; + sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator); + sobj identity; zeroit(identity); + sobj ret ; + + Integer nsimd= vobj::Nsimd(); + + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{osites}, + Reduction, + [=] (cl::sycl::id<1> item, auto &sum) { + auto osite = item[0]; + sum +=Reduce(lat[osite]); + }); + }); + theGridAccelerator->wait(); + ret = mysum[0]; + free(mysum,*theGridAccelerator); + sobjD dret; convertType(dret,ret); + return dret; +} + +template +inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites) +{ + return sumD_gpu_tensor(lat,osites); +} +template +inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites) +{ + return sumD_gpu_large(lat,osites); +} + +template +inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites) +{ + return sumD_gpu_large(lat,osites); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +// Return as same precision as input performing reduction in double precision though +///////////////////////////////////////////////////////////////////////////////////////////////////////// +template +inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites) +{ + typedef typename vobj::scalar_object sobj; + sobj result; + result = sumD_gpu(lat,osites); + return result; +} + +template +inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites) +{ + typedef typename vobj::scalar_object sobj; + sobj result; + result = sumD_gpu_large(lat,osites); + return result; +} + +NAMESPACE_END(Grid); + +/* +template Double svm_reduce(Double *vec,uint64_t L) +{ + Double sumResult; zeroit(sumResult); + Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator); + Double identity; zeroit(identity); + theGridAccelerator->submit([&](cl::sycl::handler &cgh) { + auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>()); + cgh.parallel_for(cl::sycl::range<1>{L}, + Reduction, + [=] (cl::sycl::id<1> index, auto &sum) { + sum +=vec[index]; + }); + }); + theGridAccelerator->wait(); + Double ret = d_sum[0]; + free(d_sum,*theGridAccelerator); + std::cout << " svm_reduce finished "< Date: Tue, 8 Nov 2022 13:22:45 -0800 Subject: [PATCH 17/23] PVC --- systems/PVC/benchmarks/run-1tile.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 100755 systems/PVC/benchmarks/run-1tile.sh diff --git a/systems/PVC/benchmarks/run-1tile.sh b/systems/PVC/benchmarks/run-1tile.sh old mode 100644 new mode 100755 index 923afd84..0fe80247 --- a/systems/PVC/benchmarks/run-1tile.sh +++ b/systems/PVC/benchmarks/run-1tile.sh @@ -6,7 +6,7 @@ source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh -export NT=16 +export NT=8 export I_MPI_OFFLOAD=1 export I_MPI_OFFLOAD_TOPOLIB=level_zero @@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 export ZE_AFFINITY_MASK=0.0 -mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8 +mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768 export ZE_AFFINITY_MASK=0 export I_MPI_OFFLOAD_CELL=device From 5c75aa50089bc53bc8a2a18dd16e8d6aab5041c4 Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 8 Nov 2022 13:22:57 -0800 Subject: [PATCH 18/23] Device mem --- systems/PVC/benchmarks/run-2tile-mpi.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/systems/PVC/benchmarks/run-2tile-mpi.sh b/systems/PVC/benchmarks/run-2tile-mpi.sh index 34deac2c..cefab776 100755 --- a/systems/PVC/benchmarks/run-2tile-mpi.sh +++ b/systems/PVC/benchmarks/run-2tile-mpi.sh @@ -8,7 +8,6 @@ source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh export NT=16 - # export IGC_EnableLSCFenceUGMBeforeEOT=0 # export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False" #export IGC_ShaderDumpEnable=1 @@ -20,14 +19,16 @@ export SYCL_DEVICE_FILTER=gpu,level_zero export I_MPI_OFFLOAD_CELL=tile export EnableImplicitScaling=0 export EnableWalkerPartition=0 -#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 +export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1 export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 -#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 +export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0 -mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.1x2.log -mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.2x1.log - -mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log -mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log +for i in 0 +do +mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768 +mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768 +done +#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log +#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log From 0655dab46659e21dfb7eaa4a4c947296e315fa5e Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Tue, 8 Nov 2022 13:38:54 -0800 Subject: [PATCH 19/23] Open MP on host enabled --- systems/PVC/config-command | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systems/PVC/config-command b/systems/PVC/config-command index 858edda7..cd7bba1d 100644 --- a/systems/PVC/config-command +++ b/systems/PVC/config-command @@ -12,5 +12,5 @@ INSTALL=/nfs/site/home/azusayax/install MPICXX=mpicxx \ CXX=dpcpp \ LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \ - CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare" + CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare" From e13930c8b228380b6556541c334a0ae40a97afae Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 30 Nov 2022 15:11:29 -0500 Subject: [PATCH 20/23] Faster fermtoprop case --- Grid/qcd/QCD.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index 858aead7..b292e560 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -451,9 +451,20 @@ template void pokeLorentz(vobj &lhs,const decltype(peekIndex propagator assignements ////////////////////////////////////////////// //template +#define FAST_FERM_TO_PROP template void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c) { +#ifdef FAST_FERM_TO_PROP + autoView(p_v,p,AcceleratorWrite); + autoView(f_v,f,AcceleratorRead); + accelerator_for(idx,p_v.oSites(),1,{ + for(int ss = 0; ss < Ns; ++ss) { + for(int cc = 0; cc < Fimpl::Dimension; ++cc) { + p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.) + }} + ); +#else for(int j = 0; j < Ns; ++j) { auto pjs = peekSpin(p, j, s); @@ -465,12 +476,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio } pokeSpin(p, pjs, j, s); } +#endif } //template template void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c) { +#ifdef FAST_FERM_TO_PROP + autoView(p_v,p,AcceleratorWrite); + autoView(f_v,f,AcceleratorRead); + accelerator_for(idx,p_v.oSites(),1,{ + for(int ss = 0; ss < Ns; ++ss) { + for(int cc = 0; cc < Fimpl::Dimension; ++cc) { + f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index + }} + ); +#else for(int j = 0; j < Ns; ++j) { auto pjs = peekSpin(p, j, s); @@ -482,6 +504,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato } pokeSpin(f, fj, j); } +#endif } ////////////////////////////////////////////// From 97a098636d816970631b120877b85e13691dd8ff Mon Sep 17 00:00:00 2001 From: Peter Boyle Date: Wed, 30 Nov 2022 15:36:35 -0500 Subject: [PATCH 21/23] FermToProp --- Grid/qcd/QCD.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index b292e560..9c56c12c 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -463,7 +463,7 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio for(int cc = 0; cc < Fimpl::Dimension; ++cc) { p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.) }} - ); + }); #else for(int j = 0; j < Ns; ++j) { @@ -491,7 +491,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato for(int cc = 0; cc < Fimpl::Dimension; ++cc) { f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index }} - ); + }); #else for(int j = 0; j < Ns; ++j) { From d49694f38f1b07676c4200892922c9a89bca99ad Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Tue, 6 Dec 2022 15:48:54 +0000 Subject: [PATCH 22/23] PropToFerm fix --- Grid/qcd/QCD.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index 9c56c12c..39f45ada 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -484,8 +484,8 @@ template void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c) { #ifdef FAST_FERM_TO_PROP - autoView(p_v,p,AcceleratorWrite); - autoView(f_v,f,AcceleratorRead); + autoView(p_v,p,AcceleratorRead); + autoView(f_v,f,AcceleratorWrite); accelerator_for(idx,p_v.oSites(),1,{ for(int ss = 0; ss < Ns; ++ss) { for(int cc = 0; cc < Fimpl::Dimension; ++cc) { From 40234f531ff83172cebbb7401916641d0df7ddd2 Mon Sep 17 00:00:00 2001 From: Raoul Hodgson Date: Tue, 6 Dec 2022 17:34:51 +0000 Subject: [PATCH 23/23] FermToProp accelerator_for -> thread_for --- Grid/qcd/QCD.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Grid/qcd/QCD.h b/Grid/qcd/QCD.h index 39f45ada..6dd6d5dc 100644 --- a/Grid/qcd/QCD.h +++ b/Grid/qcd/QCD.h @@ -456,9 +456,9 @@ template void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c) { #ifdef FAST_FERM_TO_PROP - autoView(p_v,p,AcceleratorWrite); - autoView(f_v,f,AcceleratorRead); - accelerator_for(idx,p_v.oSites(),1,{ + autoView(p_v,p,CpuWrite); + autoView(f_v,f,CpuRead); + thread_for(idx,p_v.oSites(),{ for(int ss = 0; ss < Ns; ++ss) { for(int cc = 0; cc < Fimpl::Dimension; ++cc) { p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.) @@ -484,9 +484,9 @@ template void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c) { #ifdef FAST_FERM_TO_PROP - autoView(p_v,p,AcceleratorRead); - autoView(f_v,f,AcceleratorWrite); - accelerator_for(idx,p_v.oSites(),1,{ + autoView(p_v,p,CpuRead); + autoView(f_v,f,CpuWrite); + thread_for(idx,p_v.oSites(),{ for(int ss = 0; ss < Ns; ++ss) { for(int cc = 0; cc < Fimpl::Dimension; ++cc) { f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index