diff --git a/Grid/lattice/Lattice.h b/Grid/lattice/Lattice.h index c4adf86a..6343db99 100644 --- a/Grid/lattice/Lattice.h +++ b/Grid/lattice/Lattice.h @@ -47,3 +47,4 @@ Author: Peter Boyle #include #include #include +#include diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h index 04540d88..668ef4b4 100644 --- a/Grid/lattice/Lattice_transfer.h +++ b/Grid/lattice/Lattice_transfer.h @@ -697,8 +697,68 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro for(int d=0;d_processors[d] == Tg->_processors[d]); } - // the above should guarantee that the operations are local + +#if 1 + + size_t nsite = 1; + for(int i=0;ioIndex(from_coor); + int fiidx = Fg->iIndex(from_coor); + int toidx = Tg->oIndex(to_coor); + int tiidx = Tg->iIndex(to_coor); + int* tt = table + 4*idx; + tt[0] = foidx; + tt[1] = fiidx; + tt[2] = toidx; + tt[3] = tiidx; + }); + + int* table_d = (int*)acceleratorAllocDevice(tbytes); + acceleratorCopyToDevice(table,table_d,tbytes); + + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + autoView(from_v,From,AcceleratorRead); + autoView(to_v,To,AcceleratorWrite); + + accelerator_for(idx,nsite,1,{ + static const int words=sizeof(vobj)/sizeof(vector_type); + int* tt = table_d + 4*idx; + int from_oidx = *tt++; + int from_lane = *tt++; + int to_oidx = *tt++; + int to_lane = *tt; + + const vector_type* from = (const vector_type *)&from_v[from_oidx]; + vector_type* to = (vector_type *)&to_v[to_oidx]; + + scalar_type stmp; + for(int w=0;w_ldimensions; Coordinate rdf = Fg->_rdimensions; Coordinate isf = Fg->_istride; @@ -738,6 +798,8 @@ void localCopyRegion(const Lattice &From,Lattice & To,Coordinate Fro #endif } }); + +#endif } @@ -830,6 +892,8 @@ void ExtractSlice(Lattice &lowDim,const Lattice & higherDim,int slic } +//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim +//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same template void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) { @@ -851,6 +915,65 @@ void InsertSliceLocal(const Lattice &lowDim, Lattice & higherDim,int } } +#if 1 + size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog]; + size_t tbytes = 4*nsite*sizeof(int); + int *table = (int*)malloc(tbytes); + + thread_for(idx,nsite,{ + Coordinate lcoor(nl); + Coordinate hcoor(nh); + lcoor[orthog] = slice_lo; + hcoor[orthog] = slice_hi; + size_t rem = idx; + for(int mu=0;muLocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu]; + lcoor[mu] = hcoor[mu] = xmu; + } + } + int loidx = lg->oIndex(lcoor); + int liidx = lg->iIndex(lcoor); + int hoidx = hg->oIndex(hcoor); + int hiidx = hg->iIndex(hcoor); + int* tt = table + 4*idx; + tt[0] = loidx; + tt[1] = liidx; + tt[2] = hoidx; + tt[3] = hiidx; + }); + + int* table_d = (int*)acceleratorAllocDevice(tbytes); + acceleratorCopyToDevice(table,table_d,tbytes); + + typedef typename vobj::vector_type vector_type; + typedef typename vobj::scalar_type scalar_type; + + autoView(lowDim_v,lowDim,AcceleratorRead); + autoView(higherDim_v,higherDim,AcceleratorWrite); + + accelerator_for(idx,nsite,1,{ + static const int words=sizeof(vobj)/sizeof(vector_type); + int* tt = table_d + 4*idx; + int from_oidx = *tt++; + int from_lane = *tt++; + int to_oidx = *tt++; + int to_lane = *tt; + + const vector_type* from = (const vector_type *)&lowDim_v[from_oidx]; + vector_type* to = (vector_type *)&higherDim_v[to_oidx]; + + scalar_type stmp; + for(int w=0;w &lowDim, Lattice & higherDim,int pokeLocalSite(s,higherDimv,hcoor); } }); +#endif } diff --git a/Grid/lattice/PaddedCell.h b/Grid/lattice/PaddedCell.h index 863d361a..5dfb6b0f 100644 --- a/Grid/lattice/PaddedCell.h +++ b/Grid/lattice/PaddedCell.h @@ -26,14 +26,32 @@ Author: Peter Boyle pboyle@bnl.gov /* END LEGAL */ #pragma once +#include + NAMESPACE_BEGIN(Grid); +//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions +template +struct CshiftImplBase{ + virtual Lattice Cshift(const Lattice &in, int dir, int shift) const = 0; + virtual ~CshiftImplBase(){} +}; +template +struct CshiftImplDefault: public CshiftImplBase{ + Lattice Cshift(const Lattice &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); } +}; +template +struct CshiftImplGauge: public CshiftImplBase{ + typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); } +}; + class PaddedCell { public: GridCartesian * unpadded_grid; int dims; int depth; std::vector grids; + ~PaddedCell() { DeleteGrids(); @@ -77,7 +95,7 @@ public: } }; template - inline Lattice Extract(Lattice &in) + inline Lattice Extract(const Lattice &in) const { Lattice out(unpadded_grid); @@ -88,19 +106,19 @@ public: return out; } template - inline Lattice Exchange(Lattice &in) + inline Lattice Exchange(const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { GridBase *old_grid = in.Grid(); int dims = old_grid->Nd(); Lattice tmp = in; for(int d=0;d - inline Lattice Expand(int dim,Lattice &in) + inline Lattice Expand(int dim, const Lattice &in, const CshiftImplBase &cshift = CshiftImplDefault()) const { GridBase *old_grid = in.Grid(); GridCartesian *new_grid = grids[dim];//These are new grids @@ -112,20 +130,40 @@ public: else conformable(old_grid,grids[dim-1]); std::cout << " dim "< &conjDirs) { _conjDirs=conjDirs; } + static inline void setDirections(const std::vector &conjDirs) { _conjDirs=conjDirs; } static inline std::vector getDirections(void) { return _conjDirs; } static inline bool isPeriodicGaugeField(void) { return false; } }; diff --git a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h index 7690092d..b9d6ac16 100644 --- a/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h +++ b/Grid/qcd/action/gauge/PlaqPlusRectangleAction.h @@ -43,7 +43,7 @@ public: private: RealD c_plaq; RealD c_rect; - + typename WilsonLoops::StapleAndRectStapleAllWorkspace workspace; public: PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){}; @@ -79,27 +79,18 @@ public: GridBase *grid = Umu.Grid(); std::vector U (Nd,grid); - std::vector U2(Nd,grid); - for(int mu=0;mu(Umu,mu); - WilsonLoops::RectStapleDouble(U2[mu],U[mu],mu); } + std::vector RectStaple(Nd,grid), Staple(Nd,grid); + WilsonLoops::StapleAndRectStapleAll(Staple, RectStaple, U, workspace); GaugeLinkField dSdU_mu(grid); GaugeLinkField staple(grid); for (int mu=0; mu < Nd; mu++){ - - // Staple in direction mu - - WilsonLoops::Staple(staple,Umu,mu); - - dSdU_mu = Ta(U[mu]*staple)*factor_p; - - WilsonLoops::RectStaple(Umu,staple,U2,U,mu); - - dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r; + dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p; + dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r; PokeIndex(dSdU, dSdU_mu, mu); } diff --git a/Grid/qcd/utils/CovariantCshift.h b/Grid/qcd/utils/CovariantCshift.h index 23984145..583c0f2b 100644 --- a/Grid/qcd/utils/CovariantCshift.h +++ b/Grid/qcd/utils/CovariantCshift.h @@ -37,13 +37,14 @@ NAMESPACE_BEGIN(Grid); // Make these members of an Impl class for BC's. namespace PeriodicBC { - + //Out(x) = Link(x)*field(x+mu) template Lattice CovShiftForward(const Lattice &Link, int mu, const Lattice &field) { return Link*Cshift(field,mu,1);// moves towards negative mu } + //Out(x) = Link^dag(x-mu)*field(x-mu) template Lattice CovShiftBackward(const Lattice &Link, int mu, const Lattice &field) @@ -52,19 +53,19 @@ namespace PeriodicBC { tmp = adj(Link)*field; return Cshift(tmp,mu,-1);// moves towards positive mu } - + //Out(x) = Link^dag(x-mu) template Lattice CovShiftIdentityBackward(const Lattice &Link, int mu) { return Cshift(adj(Link), mu, -1); } - + //Out(x) = Link(x) template Lattice CovShiftIdentityForward(const Lattice &Link, int mu) { return Link; } - + //Link(x) = Link(x+mu) template Lattice ShiftStaple(const Lattice &Link, int mu) { diff --git a/Grid/qcd/utils/WilsonLoops.h b/Grid/qcd/utils/WilsonLoops.h index d6efbd5d..78e25a8d 100644 --- a/Grid/qcd/utils/WilsonLoops.h +++ b/Grid/qcd/utils/WilsonLoops.h @@ -290,7 +290,7 @@ public: } */ ////////////////////////////////////////////////// - // the sum over all staples on each site + // the sum over all nu-oriented staples for nu != mu on each site ////////////////////////////////////////////////// static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) { @@ -300,6 +300,10 @@ public: for (int d = 0; d < Nd; d++) { U[d] = PeekIndex(Umu, d); } + Staple(staple, U, mu); + } + + static void Staple(GaugeMat &staple, const std::vector &U, int mu) { staple = Zero(); for (int nu = 0; nu < Nd; nu++) { @@ -335,6 +339,202 @@ public: } } + ///////////// + //Staples for each direction mu, summed over nu != mu + //staple: output staples for each mu (Nd) + //U: link array (Nd) + ///////////// + static void StapleAll(std::vector &staple, const std::vector &U) { + assert(staple.size() == Nd); assert(U.size() == Nd); + for(int mu=0;mu stencil; + size_t nshift; + + void generateStencil(GridBase* padded_grid){ + double t0 = usecond(); + + //Generate shift arrays + std::vector shifts = this->getShifts(); + nshift = shifts.size(); + + double t1 = usecond(); + //Generate local stencil + stencil.reset(new GeneralLocalStencil(padded_grid,shifts)); + double t2 = usecond(); + std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl; + } + public: + //Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly + const GeneralLocalStencil & getStencil(const PaddedCell &pcell){ + assert(pcell.depth >= this->paddingDepth()); + if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back()); + return *stencil; + } + size_t Nshift() const{ return nshift; } + + virtual std::vector getShifts() const = 0; + virtual int paddingDepth() const = 0; //padding depth required + + virtual ~WilsonLoopPaddedStencilWorkspace(){} + }; + + //This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces + class WilsonLoopPaddedWorkspace{ + std::vector stencil_wk; + std::unique_ptr pcell; + + void generatePcell(GridBase* unpadded_grid){ + assert(stencil_wk.size()); + int max_depth = 0; + for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth()); + + pcell.reset(new PaddedCell(max_depth, dynamic_cast(unpadded_grid))); + } + + public: + //Add a stencil definition. This should be done before the first call to retrieve a stencil object. + //Takes ownership of the pointer + void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){ + assert(!pcell); + stencil_wk.push_back(stencil); + } + + const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){ + if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid); + return stencil_wk[stencil_idx]->getStencil(*pcell); + } + const PaddedCell & getPaddedCell(GridBase* unpadded_grid){ + if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid); + return *pcell; + } + + ~WilsonLoopPaddedWorkspace(){ + for(auto &s : stencil_wk) delete s; + } + }; + + //A workspace class allowing reuse of the stencil + class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{ + public: + std::vector getShifts() const override{ + std::vector shifts; + for(int mu=0;mu &staple, const std::vector &U_padded, const PaddedCell &Cell) { + StaplePaddedAllWorkspace wk; + StaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell)); + } + + //Padded cell implementation of the staple method for all mu, summed over nu != mu + //staple: output staple for each mu, summed over nu != mu (Nd) + //U_padded: the gauge link fields padded out using the PaddedCell class + //Cell: the padded cell class + //gStencil: the precomputed generalized local stencil for the staple + static void StaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) { + double t0 = usecond(); + assert(U_padded.size() == Nd); assert(staple.size() == Nd); + assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); + assert(Cell.depth >= 1); + GridBase *ggrid = U_padded[0].Grid(); //padded cell grid + + int shift_mu_off = gStencil._npoints/Nd; + + //Open views to padded gauge links and keep open over mu loop + typedef LatticeView GaugeViewType; + size_t vsize = Nd*sizeof(GaugeViewType); + GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); + for(int i=0;ioSites(), ggrid->Nsimd(), { + decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; + stencil_ss = Zero(); + int off = outer_off; + + for(int nu=0;nu_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off++,ss); + auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off++,ss); + auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U2 * U1 * U0; + + e = gStencil_v.GetEntry(off++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(off++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(off++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U2 * U1 * U0; + } + } + + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + } //ensure views are all closed! + + staple[mu] = Cell.Extract(gStaple); + outer_off += shift_mu_off; + }//mu loop + + for(int i=0;i &U2, - std::vector &U, int mu) { + static void RectStapleOptimised(GaugeMat &Stap, const std::vector &U2, + const std::vector &U, int mu) { Stap = Zero(); @@ -732,9 +928,9 @@ public: // Up staple ___ ___ // | | - tmp = Cshift(adj(U[nu]), nu, -1); + tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1); tmp = adj(U2[mu]) * tmp; - tmp = Cshift(tmp, mu, -2); + tmp = Gimpl::CshiftLink(tmp, mu, -2); Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp); @@ -742,14 +938,14 @@ public: // |___ ___| // tmp = adj(U2[mu]) * U[nu]; - Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2)); + Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2)); // ___ ___ // | ___| // |___ ___| // - Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1); + Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1); // ___ ___ // |___ | @@ -758,7 +954,7 @@ public: // tmp= Staple2x1* Cshift(U[mu],mu,-2); // Stap+= Cshift(tmp,mu,1) ; - Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1); + Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1); ; // -- @@ -766,10 +962,10 @@ public: // // | | - tmp = Cshift(adj(U2[nu]), nu, -2); + tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2); tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp); - tmp = U2[nu] * Cshift(tmp, nu, 2); - Stap += Cshift(tmp, mu, 1); + tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2); + Stap += Gimpl::CshiftLink(tmp, mu, 1); // | | // @@ -778,25 +974,12 @@ public: tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]); tmp = adj(U2[nu]) * tmp; - tmp = Cshift(tmp, nu, -2); - Stap += Cshift(tmp, mu, 1); + tmp = Gimpl::CshiftLink(tmp, nu, -2); + Stap += Gimpl::CshiftLink(tmp, mu, 1); } } } - static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) { - RectStapleUnoptimised(Stap, Umu, mu); - } - static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap, - std::vector &U2, std::vector &U, - int mu) { - if (Gimpl::isPeriodicGaugeField()) { - RectStapleOptimised(Stap, U2, U, mu); - } else { - RectStapleUnoptimised(Stap, Umu, mu); - } - } - static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) { GridBase *grid = Umu.Grid(); @@ -895,6 +1078,288 @@ public: } } + static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) { + RectStapleUnoptimised(Stap, Umu, mu); + } + static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap, + std::vector &U2, std::vector &U, + int mu) { + RectStapleOptimised(Stap, U2, U, mu); + } + ////////////////////////////////////////////////////// + //Compute the rectangular staples for all orientations + //Stap : Array of staples (Nd) + //U: Gauge links in each direction (Nd) + ///////////////////////////////////////////////////// + static void RectStapleAll(std::vector &Stap, const std::vector &U){ + assert(Stap.size() == Nd); assert(U.size() == Nd); + std::vector U2(Nd,U[0].Grid()); + for(int mu=0;mu getShifts() const override{ + std::vector shifts; + for (int mu = 0; mu < Nd; mu++){ + for (int nu = 0; nu < Nd; nu++) { + if (nu != mu) { + auto genShift = [&](int mushift,int nushift){ + Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out; + }; + + //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+2,0)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + shifts.push_back(genShift(+2,-1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,+1)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(0,+2)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(+1,-2)); + shifts.push_back(genShift(+1,-1)); + } + } + } + return shifts; + } + + int paddingDepth() const override{ return 2; } + }; + + //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu + //staple: output staple for each mu, summed over nu != mu (Nd) + //U_padded: the gauge link fields padded out using the PaddedCell class + //Cell: the padded cell class + static void RectStaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell) { + RectStaplePaddedAllWorkspace wk; + RectStaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell)); + } + + //Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu + //staple: output staple for each mu, summed over nu != mu (Nd) + //U_padded: the gauge link fields padded out using the PaddedCell class + //Cell: the padded cell class + //gStencil: the stencil + static void RectStaplePaddedAll(std::vector &staple, const std::vector &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) { + double t0 = usecond(); + assert(U_padded.size() == Nd); assert(staple.size() == Nd); + assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back()); + assert(Cell.depth >= 2); + GridBase *ggrid = U_padded[0].Grid(); //padded cell grid + + size_t nshift = gStencil._npoints; + int mu_off_delta = nshift / Nd; + + //Open views to padded gauge links and keep open over mu loop + typedef LatticeView GaugeViewType; + size_t vsize = Nd*sizeof(GaugeViewType); + GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); + for(int i=0;ioSites(), ggrid->Nsimd(), { + decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; + stencil_ss = Zero(); + int s=offset; + for(int nu=0;nu_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + e = gStencil_v.GetEntry(s++,ss); + U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + } + } + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + offset += mu_off_delta; + }//kernel/view scope + + staple[mu] = Cell.Extract(gStaple); + }//mu loop + + for(int i=0;iaddStencil(new StaplePaddedAllWorkspace); + this->addStencil(new RectStaplePaddedAllWorkspace); + } + }; + + ////////////////////////////////////////////////////// + //Compute the 1x1 and 1x2 staples for all orientations + //Stap : Array of staples (Nd) + //RectStap: Array of rectangular staples (Nd) + //U: Gauge links in each direction (Nd) + ///////////////////////////////////////////////////// + static void StapleAndRectStapleAll(std::vector &Stap, std::vector &RectStap, const std::vector &U){ + StapleAndRectStapleAllWorkspace wk; + StapleAndRectStapleAll(Stap,RectStap,U,wk); + } + + ////////////////////////////////////////////////////// + //Compute the 1x1 and 1x2 staples for all orientations + //Stap : Array of staples (Nd) + //RectStap: Array of rectangular staples (Nd) + //U: Gauge links in each direction (Nd) + //wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse + ///////////////////////////////////////////////////// + static void StapleAndRectStapleAll(std::vector &Stap, std::vector &RectStap, const std::vector &U, StapleAndRectStapleAllWorkspace &wk){ +#if 0 + StapleAll(Stap, U); + RectStapleAll(RectStap, U); +#else + double t0 = usecond(); + + GridCartesian* unpadded_grid = dynamic_cast(U[0].Grid()); + const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid); + + CshiftImplGauge cshift_impl; + std::vector U_pad(Nd, Ghost.grids.back()); + for(int mu=0;mu_entries.resize(npoints* osites); this->_entries_p = &_entries[0]; + thread_for(site, osites, { + Coordinate Coor; + Coordinate NbrCoor; - Coordinate Coor; - Coordinate NbrCoor; - for(Integer site=0;siteoCoorFromOindex(Coor,site); - for(int d=0;d_rdimensions[d]; - NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd; + for(Integer ii=0;iioCoorFromOindex(Coor,site); + for(int d=0;d_rdimensions[d]; + NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd; + } + SE._offset = grid->oIndexReduced(NbrCoor); + + //////////////////////////////////////////////// + // Inner index permute calculation + // Simpler version using icoor calculation + //////////////////////////////////////////////// + SE._permute =0; + for(int d=0;d_fdimensions[d]; + int rd = grid->_rdimensions[d]; + int ly = grid->_simd_layout[d]; + + assert((ly==1)||(ly==2)); + + int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1 + int x = Coor[d]; // x in [0... rd-1] as an oSite + + int permute_dim = grid->PermuteDim(d); + int permute_slice=0; + if(permute_dim){ + int num = shift%rd; // Slice within dest osite cell of slice zero + int wrap = shift/rd; // Number of osite local volume cells crossed through + // x+num < rd dictates whether we are in same permute state as slice 0 + if ( x< rd-num ) permute_slice=wrap; + else permute_slice=(wrap+1)%ly; + } + if ( permute_slice ) { + int ptype =grid->PermuteType(d); + uint8_t mask =0x1<_entries[lex] = SE; } - SE._offset = grid->oIndexReduced(NbrCoor); - - //////////////////////////////////////////////// - // Inner index permute calculation - // Simpler version using icoor calculation - //////////////////////////////////////////////// - SE._permute =0; - for(int d=0;d_fdimensions[d]; - int rd = grid->_rdimensions[d]; - int ly = grid->_simd_layout[d]; - - assert((ly==1)||(ly==2)); - - int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1 - int x = Coor[d]; // x in [0... rd-1] as an oSite - - int permute_dim = grid->PermuteDim(d); - int permute_slice=0; - if(permute_dim){ - int num = shift%rd; // Slice within dest osite cell of slice zero - int wrap = shift/rd; // Number of osite local volume cells crossed through - // x+num < rd dictates whether we are in same permute state as slice 0 - if ( x< rd-num ) permute_slice=wrap; - else permute_slice=(wrap+1)%ly; - } - if ( permute_slice ) { - int ptype =grid->PermuteType(d); - uint8_t mask =0x1<_entries[lex] = SE; - } - } + }); } }; diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 2b8a7b54..7eb741b1 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -32,6 +32,7 @@ #include // subdir aggregate #include // subdir aggregate +#include ////////////////////////////////////////////////////////////////////////////////////////// // Must not lose sight that goal is to be able to construct really efficient diff --git a/Grid/tensors/Tensor_SIMT.h b/Grid/tensors/Tensor_SIMT.h index 8015d74c..9be2c500 100644 --- a/Grid/tensors/Tensor_SIMT.h +++ b/Grid/tensors/Tensor_SIMT.h @@ -73,6 +73,16 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int return vec; } } +//'perm_mask' acts as a bitmask +template accelerator_inline +vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0) +{ + auto obj = vec, tmp = vec; + for (int d=0;d accelerator_inline void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0) { @@ -83,7 +93,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ { vstream(vec, extracted); } -#else +#else //==GRID_SIMT //#ifndef GRID_SYCL @@ -166,6 +176,14 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec, return extractLane(plane,vec); } template accelerator_inline +typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd())) +{ + int plane = lane; + for (int d=0;d> (d + 1)) : plane; + return extractLane(plane,vec); +} +template accelerator_inline void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd())) { insertLane(lane,vec,extracted); diff --git a/tests/debug/Test_iwasaki_action_newstaple.cc b/tests/debug/Test_iwasaki_action_newstaple.cc new file mode 100644 index 00000000..06bdaadf --- /dev/null +++ b/tests/debug/Test_iwasaki_action_newstaple.cc @@ -0,0 +1,188 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_iwasaki_action_newstaple.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace std; +using namespace Grid; + +//////////////////////////////////////////////////////////////////////// +// PlaqPlusRectangleActoin +//////////////////////////////////////////////////////////////////////// +template +class PlaqPlusRectangleActionOrig : public Action { +public: + + INHERIT_GIMPL_TYPES(Gimpl); + +private: + RealD c_plaq; + RealD c_rect; + +public: + PlaqPlusRectangleActionOrig(RealD b,RealD c): c_plaq(b),c_rect(c){}; + + virtual std::string action_name(){return "PlaqPlusRectangleActionOrig";} + + virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms + + virtual std::string LogParameters(){ + std::stringstream sstream; + sstream << GridLogMessage << "["<gSites(); + + RealD plaq = WilsonLoops::avgPlaquette(U); + RealD rect = WilsonLoops::avgRectangle(U); + + RealD action=c_plaq*(1.0 -plaq)*(Nd*(Nd-1.0))*vol*0.5 + +c_rect*(1.0 -rect)*(Nd*(Nd-1.0))*vol; + + return action; + }; + + virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) { + //extend Ta to include Lorentz indexes + RealD factor_p = c_plaq/RealD(Nc)*0.5; + RealD factor_r = c_rect/RealD(Nc)*0.5; + + GridBase *grid = Umu.Grid(); + + std::vector U (Nd,grid); + std::vector U2(Nd,grid); + + for(int mu=0;mu(Umu,mu); + WilsonLoops::RectStapleDouble(U2[mu],U[mu],mu); + } + + GaugeLinkField dSdU_mu(grid); + GaugeLinkField staple(grid); + + for (int mu=0; mu < Nd; mu++){ + + // Staple in direction mu + + WilsonLoops::Staple(staple,Umu,mu); + + dSdU_mu = Ta(U[mu]*staple)*factor_p; + + WilsonLoops::RectStaple(Umu,staple,U2,U,mu); + + dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r; + + PokeIndex(dSdU, dSdU_mu, mu); + } + + }; + +}; + +// Convenience for common physically defined cases. +// +// RBC c1 parameterisation is not really RBC but don't have good +// reference and we are happy to change name if prior use of this plaq coeff +// parameterisation is made known to us. +template +class RBCGaugeActionOrig : public PlaqPlusRectangleActionOrig { +public: + INHERIT_GIMPL_TYPES(Gimpl); + RBCGaugeActionOrig(RealD beta,RealD c1) : PlaqPlusRectangleActionOrig(beta*(1.0-8.0*c1), beta*c1) {}; + virtual std::string action_name(){return "RBCGaugeActionOrig";} +}; + +template +class IwasakiGaugeActionOrig : public RBCGaugeActionOrig { +public: + INHERIT_GIMPL_TYPES(Gimpl); + IwasakiGaugeActionOrig(RealD beta) : RBCGaugeActionOrig(beta,-0.331) {}; + virtual std::string action_name(){return "IwasakiGaugeActionOrig";} +}; + + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + std::cout << " mpi "<({45,12,81,9})); + LatticeGaugeField U(&GRID); + + SU::HotConfiguration(pRNG,U); + + //#define PRD +#ifdef PRD + typedef PeriodicGimplD Gimpl; +#else + typedef ConjugateGimplD Gimpl; + std::vector conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; + Gimpl::setDirections(conj_dirs); +#endif + + typedef typename WilsonLoops::GaugeMat GaugeMat; + typedef typename WilsonLoops::GaugeLorentz GaugeLorentz; + + GaugeLorentz derivOrig(&GRID), derivNew(&GRID); + double beta = 2.13; + IwasakiGaugeActionOrig action_orig(beta); + IwasakiGaugeAction action_new(beta); + + double torig=0, tnew=0; + int ntest = 10; + for(int i=0;i +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + Coordinate latt_size = GridDefaultLatt(); + Coordinate simd_layout= GridDefaultSimd(Nd,vComplexD::Nsimd()); + Coordinate mpi_layout = GridDefaultMpi(); + std::cout << " mpi "<({45,12,81,9})); + LatticeGaugeField U(&GRID); + + SU::HotConfiguration(pRNG,U); + + //#define PRD +#ifdef PRD + typedef PeriodicGimplD Gimpl; +#else + typedef ConjugateGimplD Gimpl; + std::vector conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; + Gimpl::setDirections(conj_dirs); +#endif + + typedef typename WilsonLoops::GaugeMat GaugeMat; + typedef typename WilsonLoops::GaugeLorentz GaugeLorentz; + + int count = 0; + double torig=0, topt=0; + + std::vector Umu(Nd,&GRID), U2(Nd,&GRID); + for(int mu=0;mu(U,mu); + WilsonLoops::RectStapleDouble(U2[mu], Umu[mu], mu); + } + + std::cout << GridLogMessage << "Checking optimized vs unoptimized RectStaple" << std::endl; + for(int mu=0;mu::RectStapleUnoptimised(staple_orig,U,mu); + double t1 = usecond(); + WilsonLoops::RectStapleOptimised(staple_opt, U2, Umu, mu); + double t2 = usecond(); + torig += t1-t0; topt += t2-t1; + ++count; + + GaugeMat diff = staple_orig - staple_opt; + double n = norm2(diff); + std::cout << GridLogMessage << mu << " " << n << std::endl; + assert(n<1e-10); + } + std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms, optimized: " << topt/1000/count << "ms" << std::endl; + + Grid_finalize(); +} diff --git a/tests/debug/Test_padded_cell_staple.cc b/tests/debug/Test_padded_cell_staple.cc new file mode 100644 index 00000000..6d7bc8c9 --- /dev/null +++ b/tests/debug/Test_padded_cell_staple.cc @@ -0,0 +1,580 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_padded_cell_staple.cc + + Copyright (C) 2015 + +Author: Christopher Kelly +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include +#include + +using namespace std; +using namespace Grid; + +template class WilsonLoopsTest : public Gimpl { +public: + INHERIT_GIMPL_TYPES(Gimpl); + + typedef typename Gimpl::GaugeLinkField GaugeMat; + typedef typename Gimpl::GaugeField GaugeLorentz; + + + //Original implementation + static void StapleOrig(GaugeMat &staple, const GaugeLorentz &Umu, int mu, + int nu) { + + GridBase *grid = Umu.Grid(); + + std::vector U(Nd, grid); + for (int d = 0; d < Nd; d++) { + U[d] = PeekIndex(Umu, d); + } + staple = Zero(); + + if (nu != mu) { + + // mu + // ^ + // |__> nu + + // __ + // | + // __| + // + + //Forward: Out(x) = Link(x)*field(x+mu) + //Backward: Out(x) = Link^dag(x-mu)*field(x-mu) + //ShiftStaple: Link(x) = Link(x+mu) + + //tmp1 = U^dag_nu(x-nu) + //tmp2 = U^dag_mu(x-mu) tmp1(x-mu) = U^dag_mu(x-mu) U^dag_nu(x-nu-mu) + //tmp3 = U_nu(x) tmp2(x+nu) = U_nu(x)U^dag_mu(x-mu+nu) U^dag_nu(x-mu) + //tmp4 = tmp(x+mu) = U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x) + + staple += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))), + mu); + + // __ + // | + // |__ + // + // + + //tmp1 = U_mu^dag(x-mu) U_nu(x-mu) + //tmp2 = U_nu^dag(x-nu) tmp1(x-nu) = U_nu^dag(x-nu) U_mu^dag(x-mu-nu) U_nu(x-mu-nu) + //tmp3 = tmp2(x+mu) = U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu) + staple += Gimpl::ShiftStaple( + Gimpl::CovShiftBackward(U[nu], nu, + Gimpl::CovShiftBackward(U[mu], mu, U[nu])), + mu); + } + } + + static void StaplePadded(GaugeMat &staple, const GaugeLorentz &U, int mu, + int nu) { + if(nu==mu){ + staple = Zero(); + return; + } + double peek = 0, construct = 0, exchange = 0, coord = 0, stencil =0, kernel = 0, extract = 0, total = 0; + + double tstart = usecond(); + double t=tstart; + + PaddedCell Ghost(1, (GridCartesian*)U.Grid()); + + construct += usecond() - t; + + t=usecond(); + GaugeMat U_mu = PeekIndex(U, mu); + GaugeMat U_nu = PeekIndex(U, nu); + peek += usecond() - t; + + t=usecond(); + CshiftImplGauge cshift_impl; + GaugeMat Ug_mu = Ghost.Exchange(U_mu, cshift_impl); + GaugeMat Ug_nu = Ghost.Exchange(U_nu, cshift_impl); + exchange += usecond() - t; + + GridBase *ggrid = Ug_mu.Grid(); + + GaugeMat gStaple(ggrid); + + t=usecond(); + Coordinate shift_0(Nd,0); + Coordinate shift_mu(Nd,0); shift_mu[mu]=1; + Coordinate shift_nu(Nd,0); shift_nu[nu]=1; + Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1; + Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1; + + std::vector shifts; + + //U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x) + shifts.push_back(shift_0); + shifts.push_back(shift_nu); + shifts.push_back(shift_mu); + + //U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu) + shifts.push_back(shift_mnu); + shifts.push_back(shift_mnu); + shifts.push_back(shift_mnu_pmu); + coord += usecond()-t; + + t=usecond(); + GeneralLocalStencil gStencil(ggrid,shifts); + stencil += usecond() -t; + + t=usecond(); + { + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); + autoView( Ug_mu_v , Ug_mu, AcceleratorRead); + autoView( Ug_nu_v , Ug_nu, AcceleratorRead); + + accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), { + GeneralStencilEntry const* e = gStencil_v.GetEntry(0,ss); + auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(1,ss); + auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(2,ss); + auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + + auto stencil_ss = U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x; + + e = gStencil_v.GetEntry(3,ss); + auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(4,ss); + auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(5,ss); + auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu; + + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + } //ensure views are all closed! + kernel += usecond() - t; + + t=usecond(); + staple = Ghost.Extract(gStaple); + extract += usecond()-t; + + total += usecond() - tstart; + std::cout << GridLogMessage << "StaplePadded timings peek:" << peek << " construct:" << construct << " exchange:" << exchange << " coord:" << coord << " stencil:" << stencil << " kernel:" << kernel << " extract:" << extract << " total:" << total << std::endl; + } + + static void RectStapleOrig(GaugeMat &Stap, const GaugeLorentz &Umu, + int mu) { + GridBase *grid = Umu.Grid(); + + std::vector U(Nd, grid); + for (int d = 0; d < Nd; d++) { + U[d] = PeekIndex(Umu, d); + } + + Stap = Zero(); + + for (int nu = 0; nu < Nd; nu++) { + if (nu != mu) { + // __ ___ + // | __ | + // + //tmp1 = U_nu^dag(x-nu) + //tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu) U_nu^dag(x-nu-mu) + //tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu) U_mu^dag(x-2mu) U_nu^dag(x-nu-2mu) + //tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu) U_mu^dag(x-2mu+nu) U_nu^dag(x-2mu) + //tmp5 = U_mu(x)tmp4(x+mu) = U_mu(x)U_nu(x+mu)U_mu^dag(x+nu) U_mu^dag(x-mu+nu) U_nu^dag(x-mu) + //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x) + + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[mu], mu, + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftIdentityBackward(U[nu], nu))))), + mu); + + // __ + // |__ __ | + + //tmp1 = U^dag_mu(x-mu)U_nu(x-mu) + //tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu) + //tmp3 = U^dag_nu(x-nu)tmp2(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu) + //tmp4 = U_mu(x)tmp3(x+mu) = U_mu(x)U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu) + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftBackward(U[mu], mu, U[nu])))), + mu); + + // __ + // |__ __ | + //Forward: Out(x) = Link(x)*field(x+mu) + //Backward: Out(x) = Link^dag(x-mu)*field(x-mu) + //ShiftStaple: Link(x) = Link(x+mu) + + //tmp1 = U_nu(x)U_mu(x+nu) + //tmp2 = U^dag_mu(x-mu)tmp1(x-mu) = U^dag_mu(x-mu)U_nu(x-mu)U_mu(x+nu-mu) + //tmp3 = U^dag_mu(x-mu)tmp2(x-mu) = U^dag_mu(x-mu)U^dag_mu(x-2mu)U_nu(x-2mu)U_mu(x+nu-2mu) + //tmp4 = U^dag_nu(x-nu)tmp3(x-nu) = U^dag_nu(x-nu)U^dag_mu(x-mu-nu)U^dag_mu(x-2mu-nu)U_nu(x-2mu-nu)U_mu(x-2mu) + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[mu])))), + mu); + + // __ ___ + // |__ | + //tmp1 = U_nu^dag(x-nu)U_mu(x-nu) + //tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_mu(x-mu-nu) + //tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_mu^dag(x-2mu)U_nu^dag(x-2mu-nu)U_mu(x-2mu-nu) + //tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_mu^dag(x-2mu+nu)U_nu^dag(x-2mu)U_mu(x-2mu) + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftBackward(U[nu], nu, U[mu])))), + mu); + + // -- + // | | + // + // | | + //tmp1 = U_nu^dag(x-nu) + //tmp2 = U_nu^dag(x-nu)tmp1(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu) + //tmp3 = U_mu^dag(x-mu)tmp2(x-mu) = U_mu^dag(x-mu)U_nu^dag(x-mu-nu)U_nu^dag(x-mu-2nu) + //tmp4 = U_nu(x)tmp3(x+nu) = U_nu(x)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_nu^dag(x-mu-nu) + //tmp5 = U_nu(x)tmp4(x+nu) = U_nu(x)U_nu(x+nu)U_mu^dag(x-mu+2nu)U_nu^dag(x-mu+nu)U_nu^dag(x-mu) + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftForward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftIdentityBackward(U[nu], nu))))), + mu); + + // | | + // + // | | + // -- + //tmp1 = U_nu(x)U_nu(x+nu) + //tmp2 = U_mu^dag(x-mu)tmp1(x-mu) = U_mu^dag(x-mu)U_nu(x-mu)U_nu(x-mu+nu) + //tmp3 = U_nu^dag(x-nu)tmp2(x-nu) = U_nu^dag(x-nu)U_mu^dag(x-mu-nu)U_nu(x-mu-nu)U_nu(x-mu) + //tmp4 = U_nu^dag(x-nu)tmp3(x-nu) = U_nu^dag(x-nu)U_nu^dag(x-2nu)U_mu^dag(x-mu-2nu)U_nu(x-mu-2nu)U_nu(x-mu-nu) + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + Stap += Gimpl::ShiftStaple( + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[nu], nu, + Gimpl::CovShiftBackward( + U[mu], mu, Gimpl::CovShiftForward(U[nu], nu, U[nu])))), + mu); + } + } + } + + + static void RectStaplePadded(GaugeMat &Stap, const GaugeLorentz &U, + int mu) { + PaddedCell Ghost(2,(GridCartesian*)U.Grid()); + GridBase *ggrid = Ghost.grids.back(); + + CshiftImplGauge cshift_impl; + std::vector Ug_dirs(Nd,ggrid); + for(int i=0;i(U, i), cshift_impl); + + GaugeMat gStaple(ggrid); + + std::vector shifts; + for (int nu = 0; nu < Nd; nu++) { + if (nu != mu) { + auto genShift = [&](int mushift,int nushift){ + Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out; + }; + + //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+2,0)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + shifts.push_back(genShift(+2,-1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(-1,-1)); + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(+1,-1)); + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,0)); + shifts.push_back(genShift(-1,+1)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + shifts.push_back(genShift(0,0)); + shifts.push_back(genShift(0,+1)); + shifts.push_back(genShift(0,+2)); + shifts.push_back(genShift(+1,+1)); + shifts.push_back(genShift(+1,0)); + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + shifts.push_back(genShift(0,-1)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(0,-2)); + shifts.push_back(genShift(+1,-2)); + shifts.push_back(genShift(+1,-1)); + } + } + size_t nshift = shifts.size(); + + GeneralLocalStencil gStencil(ggrid,shifts); + { + autoView( gStaple_v , gStaple, AcceleratorWrite); + auto gStencil_v = gStencil.View(); + + typedef LatticeView GaugeViewType; + size_t vsize = Nd*sizeof(GaugeViewType); + GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize); + for(int i=0;ioSites(), ggrid->Nsimd(), { + decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss; + stencil_ss = Zero(); + int s=0; + for(int nu=0;nu_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x) + e = gStencil_v.GetEntry(s++,ss); + U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu) + e = gStencil_v.GetEntry(s++,ss); + U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd); + e = gStencil_v.GetEntry(s++,ss); + U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + e = gStencil_v.GetEntry(s++,ss); + U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd)); + + stencil_ss = stencil_ss + U4*U3*U2*U1*U0; + + } + } + assert(s==nshift); + coalescedWrite(gStaple_v[ss],stencil_ss); + } + ); + + for(int i=0;i({45,12,81,9})); + LatticeGaugeField U(&GRID); + + SU::HotConfiguration(pRNG,U); + + //typedef PeriodicGimplD Gimpl; + typedef ConjugateGimplD Gimpl; + std::vector conj_dirs(Nd,0); conj_dirs[0]=1; conj_dirs[3]=1; + Gimpl::setDirections(conj_dirs); + + typedef typename WilsonLoopsTest::GaugeMat GaugeMat; + typedef typename WilsonLoopsTest::GaugeLorentz GaugeLorentz; + + std::cout << GridLogMessage << "Checking Staple" << std::endl; + int count = 0; + double torig=0, tpadded=0; + + for(int mu=0;mu::StapleOrig(staple_orig,U,mu,nu); + double t1 = usecond(); + WilsonLoopsTest::StaplePadded(staple_padded,U,mu,nu); + double t2 = usecond(); + torig += t1-t0; tpadded += t2-t1; + ++count; + + GaugeMat diff = staple_orig - staple_padded; + double n = norm2(diff); + std::cout << GridLogMessage << mu << " " << nu << " " << n << std::endl; + assert(n<1e-10); + } + } + } + std::cout << GridLogMessage << "Staple timings orig: " << torig/1000/count << "ms, padded: " << tpadded/1000/count << "ms" << std::endl; + count=0; torig=tpadded=0; + + std::cout << GridLogMessage << "Checking RectStaple" << std::endl; + for(int mu=0;mu::RectStapleOrig(staple_orig,U,mu); + double t1 = usecond(); + WilsonLoopsTest::RectStaplePadded(staple_padded,U,mu); + double t2 = usecond(); + torig += t1-t0; tpadded += t2-t1; + ++count; + + GaugeMat diff = staple_orig - staple_padded; + double n = norm2(diff); + std::cout << GridLogMessage << mu << " " << n << std::endl; + assert(n<1e-10); + } + std::cout << GridLogMessage << "RectStaple timings orig: " << torig/1000/count << "ms, padded: " << tpadded/1000/count << "ms" << std::endl; + + Grid_finalize(); +}